diff --git a/.github/scripts/e2e-test-server-linux-and-mac.sh b/.github/scripts/e2e-test-server-linux-and-mac.sh old mode 100644 new mode 100755 diff --git a/.github/scripts/e2e-test-server-windows.bat b/.github/scripts/e2e-test-server-windows.bat index d1bf785..e945ff8 100644 --- a/.github/scripts/e2e-test-server-windows.bat +++ b/.github/scripts/e2e-test-server-windows.bat @@ -20,7 +20,6 @@ echo BINARY_NAME=%BINARY_NAME% del %TEMP%\response1.log 2>nul del %TEMP%\response2.log 2>nul -del %TEMP%\response3.log 2>nul del %TEMP%\response4.log 2>nul del %TEMP%\response5.log 2>nul del %TEMP%\response6.log 2>nul @@ -65,18 +64,18 @@ call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%" call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%" set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}" set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}" -set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}" set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}" set "curl_data5={}" set "curl_data6={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}" +@REM set "curl_data7={\"model\": \"test-embedding\"}" rem Print the values of curl_data for debugging echo curl_data1=%curl_data1% echo curl_data2=%curl_data2% -echo curl_data3=%curl_data3% echo curl_data4=%curl_data4% echo curl_data5=%curl_data5% echo curl_data6=%curl_data6% +@REM echo curl_data7=%curl_data7% rem Run the curl commands and capture the status code curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1 @@ -85,8 +84,6 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" -- --header "Content-Type: application/json" ^ --data "%curl_data2%" > %TEMP%\response2.log 2>&1 -curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1 - curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1 curl.exe --connect-timeout 60 -o "%TEMP%\response5.log" --request GET -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/models" --header "Content-Type: application/json" --data "%curl_data5%" > %TEMP%\response5.log 2>&1 @@ -100,7 +97,6 @@ set "error_occurred=0" rem Read the status codes from the log files for /f %%a in (%TEMP%\response1.log) do set "response1=%%a" for /f %%a in (%TEMP%\response2.log) do set "response2=%%a" -for /f %%a in (%TEMP%\response3.log) do set "response3=%%a" for /f %%a in (%TEMP%\response4.log) do set "response4=%%a" for /f %%a in (%TEMP%\response5.log) do set "response5=%%a" for /f %%a in (%TEMP%\response6.log) do set "response6=%%a" @@ -117,12 +113,6 @@ if "%response2%" neq "200" ( set "error_occurred=1" ) -if "%response3%" neq "200" ( - echo The third curl command failed with status code: %response3% - type %TEMP%\response3.log 
- set "error_occurred=1" -) - if "%response4%" neq "200" ( echo The fourth curl command failed with status code: %response4% type %TEMP%\response4.log @@ -158,10 +148,6 @@ echo ---------------------- echo Log run test: type %TEMP%\response2.log -echo ---------------------- -echo Log unload model: -type %TEMP%\response3.log - echo ---------------------- echo Log load embedding model: type %TEMP%\response4.log diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index af008b2..46ed384 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -65,7 +65,7 @@ jobs: - os: "linux" name: "arm64" runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: true vulkan: false ccache: true @@ -73,7 +73,7 @@ jobs: - os: "linux" name: "amd64-avx2" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: true vulkan: false ccache: true @@ -81,7 +81,7 @@ jobs: - os: "linux" name: "amd64-noavx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -89,7 +89,7 @@ jobs: - os: "linux" name: "amd64-avx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -97,7 +97,7 @@ jobs: - os: 
"linux" name: "amd64-avx512" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -105,7 +105,7 @@ jobs: - os: "linux" name: "amd64-vulkan" runs-on: "ubuntu-22-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: true ccache: true @@ -113,7 +113,7 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -121,7 +121,7 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -129,7 +129,7 @@ jobs: - os: "linux" name: "amd64-avx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} 
-DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -137,7 +137,7 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -145,7 +145,7 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -153,7 +153,7 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -161,7 +161,7 @@ jobs: - os: "linux" name: "amd64-avx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -169,7 +169,7 @@ jobs: - os: "linux" name: 
"amd64-avx512-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -177,7 +177,7 @@ jobs: - os: "mac" name: "amd64" runs-on: "macos-selfhosted-12" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" run-e2e: true vulkan: false ccache: false @@ -185,7 +185,7 @@ jobs: - os: "mac" name: "arm64" runs-on: "macos-selfhosted-12-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" run-e2e: false vulkan: false ccache: false @@ -193,7 +193,7 @@ jobs: - os: "windows" name: "amd64-avx2" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: true vulkan: false ccache: false @@ -201,7 +201,7 @@ jobs: - os: "windows" name: "amd64-noavx" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -209,7 +209,7 @@ jobs: - os: "windows" name: "amd64-avx" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF 
-DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: true vulkan: false ccache: false @@ -217,7 +217,7 @@ jobs: - os: "windows" name: "amd64-avx512" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -225,7 +225,7 @@ jobs: - os: "windows" name: "amd64-vulkan" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: true ccache: false @@ -233,7 +233,7 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -241,7 +241,7 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON 
-DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -249,7 +249,7 @@ jobs: - os: "windows" name: "amd64-avx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -257,7 +257,7 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -265,7 +265,7 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -273,7 +273,7 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache 
-GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -281,7 +281,7 @@ jobs: - os: "windows" name: "amd64-avx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -289,7 +289,7 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -305,7 +305,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: use python for linux continue-on-error: true diff --git a/.github/workflows/convert-model-all-quant.yml b/.github/workflows/convert-model-all-quant.yml index 0899542..ccfdaaa 100644 --- a/.github/workflows/convert-model-all-quant.yml +++ b/.github/workflows/convert-model-all-quant.yml @@ -39,7 +39,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: Set up Python uses: actions/setup-python@v5 # v5.1.1 diff --git a/.github/workflows/create-pr-sync-remote.yml b/.github/workflows/create-pr-sync-remote.yml index 696214f..d2293ab 100644 --- a/.github/workflows/create-pr-sync-remote.yml +++ b/.github/workflows/create-pr-sync-remote.yml @@ -47,4 +47,4 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch \ No newline at end of file + git apply ../patches/0002-Build-llama-cpp-examples.patch \ No newline at end of file diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 
d21b5c3..2b8397a 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -64,7 +64,8 @@ jobs: - os: "linux" name: "arm64" runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: true vulkan: false ccache: true @@ -72,15 +73,17 @@ jobs: - os: "linux" name: "amd64-avx2" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: true + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + run-e2e: false vulkan: false ccache: true ccache-dir: "/home/runner/.ccache" - os: "linux" name: "amd64-noavx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -88,7 +91,8 @@ jobs: - os: "linux" name: "amd64-avx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF 
-DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -96,7 +100,8 @@ jobs: - os: "linux" name: "amd64-avx512" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -104,7 +109,8 @@ jobs: - os: "linux" name: "amd64-vulkan" runs-on: "ubuntu-22-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: true ccache: true @@ -112,7 +118,8 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} 
-DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -120,7 +127,8 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -128,7 +136,8 @@ jobs: - os: "linux" name: "amd64-avx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -136,7 +145,8 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON 
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -144,7 +154,8 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -152,7 +163,8 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -160,7 +172,8 @@ jobs: - os: "linux" name: "amd64-avx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + 
cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -168,7 +181,8 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -176,7 +190,8 @@ jobs: - os: "mac" name: "amd64" runs-on: "macos-selfhosted-12" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" + cmake-lib-flags: "" run-e2e: true vulkan: false ccache: false @@ -184,7 +199,8 @@ jobs: - os: "mac" name: "arm64" runs-on: "macos-selfhosted-12-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-lib-flags: "" run-e2e: false vulkan: false ccache: false @@ -192,15 +208,17 @@ jobs: - os: "windows" name: "amd64-avx2" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: true + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + 
run-e2e: false vulkan: false ccache: false ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - os: "windows" name: "amd64-noavx" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -208,15 +226,17 @@ jobs: - os: "windows" name: "amd64-avx" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: true + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + run-e2e: false vulkan: false ccache: false ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - os: "windows" name: "amd64-avx512" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -224,7 +244,8 @@ jobs: - os: "windows" name: "amd64-vulkan" runs-on: "windows-cuda-12-0" - cmake-flags: 
"-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: true ccache: false @@ -232,7 +253,8 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -240,7 +262,8 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ 
-248,7 +271,8 @@ jobs: - os: "windows" name: "amd64-avx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -256,7 +280,8 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -264,7 +289,8 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} 
-DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -272,7 +298,8 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -280,7 +307,8 @@ jobs: - os: "windows" name: "amd64-avx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -288,7 +316,8 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON 
-DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -304,7 +333,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: use python for linux continue-on-error: true @@ -403,7 +432,7 @@ jobs: - name: Build id: build-and-test run: | - make build-example-server CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" + make build-example-server CMAKE_SERVER_FLAGS="${{ matrix.cmake-flags }}" CMAKE_LIB_FLAGS="${{ matrix.cmake-lib-flags }}" - name: Pre Package run: | diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index efda987..65ff919 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -85,7 +85,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: Wait for CI to pass env: @@ -133,7 +133,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: Configure Git run: | diff --git a/.github/workflows/template-e2e-weekend-test.yml b/.github/workflows/template-e2e-weekend-test.yml index 1cb5778..a7d1b67 100644 --- a/.github/workflows/template-e2e-weekend-test.yml +++ b/.github/workflows/template-e2e-weekend-test.yml @@ -91,7 +91,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: Set up Python uses: actions/setup-python@v5 # v5.1.1 diff --git a/.github/workflows/template-quality-gate-pr.yml b/.github/workflows/template-quality-gate-pr.yml index 4af2e5b..f9042bb 100644 --- a/.github/workflows/template-quality-gate-pr.yml +++ b/.github/workflows/template-quality-gate-pr.yml @@ -31,23 +31,26 @@ jobs: - os: "linux" name: "arm64" runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: true + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + run-e2e: false vulkan: false ccache: true ccache-dir: "/home/runner/.ccache" - os: "linux" name: "amd64-avx2" runs-on: "ubuntu-20-04" 
- cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" - run-e2e: true + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + run-e2e: false vulkan: false ccache: true ccache-dir: "/home/runner/.ccache" - os: "linux" name: "amd64-noavx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -55,7 +58,8 @@ jobs: - os: "linux" name: "amd64-avx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -63,7 +67,8 @@ jobs: - os: "linux" name: "amd64-avx512" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: 
"-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -71,7 +76,8 @@ jobs: - os: "linux" name: "amd64-vulkan" runs-on: "ubuntu-22-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: true ccache: true @@ -79,7 +85,8 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -87,7 +94,8 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON 
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -95,7 +103,8 @@ jobs: - os: "linux" name: "amd64-avx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -103,7 +112,8 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -111,7 +121,8 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} 
-DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -119,7 +130,8 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -127,7 +139,8 @@ jobs: - os: "linux" name: "amd64-avx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -135,7 +148,8 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true 
@@ -143,7 +157,8 @@ jobs: - os: "mac" name: "amd64" runs-on: "macos-selfhosted-12" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" + cmake-lib-flags: "" run-e2e: true vulkan: false ccache: false @@ -151,7 +166,8 @@ jobs: - os: "mac" name: "arm64" runs-on: "macos-selfhosted-12-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-lib-flags: "" run-e2e: true vulkan: false ccache: false @@ -159,15 +175,17 @@ jobs: - os: "windows" name: "amd64-avx2" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: true + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + run-e2e: false vulkan: false ccache: false ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - os: "windows" name: "amd64-noavx" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -175,15 +193,17 @@ jobs: - os: "windows" name: "amd64-avx" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" - run-e2e: true + cmake-flags: 
"-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + run-e2e: false vulkan: false ccache: false ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' - os: "windows" name: "amd64-avx512" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -191,7 +211,8 @@ jobs: - os: "windows" name: "amd64-vulkan" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: true ccache: false @@ -199,7 +220,8 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache 
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -207,7 +229,8 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: true @@ -215,7 +238,8 @@ jobs: - os: "windows" name: "amd64-avx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -223,7 +247,8 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' 
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -231,7 +256,8 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -239,7 +265,8 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -247,7 +274,8 @@ jobs: - os: "windows" name: "amd64-avx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: 
"-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -255,7 +283,8 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-lib-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -271,7 +300,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: use python for linux continue-on-error: true @@ -352,7 +381,7 @@ jobs: - name: Build id: build-and-test run: | - make build-example-server CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" + make build-example-server CMAKE_SERVER_FLAGS="${{ matrix.cmake-flags }}" CMAKE_LIB_FLAGS="${{ matrix.cmake-lib-flags }}" - name: Pre Package run: | diff --git a/.github/workflows/template-quality-gate-submodule.yml b/.github/workflows/template-quality-gate-submodule.yml index 11a8d43..264a66b 100644 --- a/.github/workflows/template-quality-gate-submodule.yml +++ b/.github/workflows/template-quality-gate-submodule.yml @@ -31,7 +31,7 @@ jobs: - os: "linux" name: "arm64" runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: true vulkan: false ccache: true @@ -39,7 +39,7 @@ jobs: - os: "linux" name: "amd64-avx2" runs-on: 
"ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: true vulkan: false ccache: true @@ -47,7 +47,7 @@ jobs: - os: "linux" name: "amd64-noavx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -55,7 +55,7 @@ jobs: - os: "linux" name: "amd64-avx" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -63,7 +63,7 @@ jobs: - os: "linux" name: "amd64-avx512" runs-on: "ubuntu-20-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -71,7 +71,7 @@ jobs: - os: "linux" name: "amd64-vulkan" runs-on: "ubuntu-22-04" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: true 
ccache: true @@ -79,7 +79,7 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -87,7 +87,7 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -95,7 +95,7 @@ jobs: - os: "linux" name: "amd64-avx-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -103,7 +103,7 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-11-7" runs-on: "ubuntu-20-04-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -111,7 +111,7 @@ jobs: - os: "linux" name: "amd64-noavx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache 
-DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -119,7 +119,7 @@ jobs: - os: "linux" name: "amd64-avx2-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -127,7 +127,7 @@ jobs: - os: "linux" name: "amd64-avx-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -135,7 +135,7 @@ jobs: - os: "linux" name: "amd64-avx512-cuda-12-0" runs-on: "ubuntu-20-04-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE='Release' -GNinja" run-e2e: false vulkan: false ccache: true @@ -143,7 +143,7 @@ jobs: - os: "mac" name: "amd64" runs-on: "macos-selfhosted-12" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL=OFF -DGGML_NATIVE=OFF" run-e2e: true vulkan: false ccache: false @@ -151,7 +151,7 @@ jobs: - os: "mac" name: "arm64" runs-on: "macos-selfhosted-12-arm64" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} 
-DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_METAL_EMBED_LIBRARY=ON" run-e2e: true vulkan: false ccache: false @@ -159,7 +159,7 @@ jobs: - os: "windows" name: "amd64-avx2" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: true vulkan: false ccache: false @@ -167,7 +167,7 @@ jobs: - os: "windows" name: "amd64-noavx" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -175,7 +175,7 @@ jobs: - os: "windows" name: "amd64-avx" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: true vulkan: false ccache: false @@ -183,7 +183,7 @@ jobs: - os: "windows" name: "amd64-avx512" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: false ccache: false @@ -191,7 +191,7 @@ jobs: - os: "windows" name: "amd64-vulkan" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" + cmake-flags: 
"-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -GNinja" run-e2e: false vulkan: true ccache: false @@ -199,7 +199,7 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -207,7 +207,7 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -215,7 +215,7 @@ jobs: - os: "windows" name: "amd64-avx-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -223,7 +223,7 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-12-0" runs-on: "windows-cuda-12-0" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: 
"-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -231,7 +231,7 @@ jobs: - os: "windows" name: "amd64-noavx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -239,7 +239,7 @@ jobs: - os: "windows" name: "amd64-avx2-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -247,7 +247,7 @@ jobs: - os: "windows" name: "amd64-avx-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -255,7 +255,7 @@ jobs: - os: "windows" name: "amd64-avx512-cuda-11-7" runs-on: "windows-cuda-11-7" - cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache 
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DLLAMA_BUILD_EXAMPLES=ON -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=ON -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" run-e2e: false vulkan: false ccache: true @@ -271,7 +271,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: use python for linux continue-on-error: true diff --git a/.github/workflows/update-model-yml.yml b/.github/workflows/update-model-yml.yml index 23a8e41..4ad8722 100644 --- a/.github/workflows/update-model-yml.yml +++ b/.github/workflows/update-model-yml.yml @@ -40,7 +40,7 @@ jobs: - name: Apply patch file run: | cd llama.cpp - git apply ../patches/0001-Add-API-query-buffer-size.patch + git apply ../patches/0002-Build-llama-cpp-examples.patch - name: Set up Python uses: actions/setup-python@v5 diff --git a/Makefile b/Makefile index a91b73d..52fd0bf 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ # Makefile for Cortex llamacpp engine - Build, Lint, Test, and Clean -CMAKE_EXTRA_FLAGS ?= "" +CMAKE_SERVER_FLAGS ?= "" +CMAKE_LIB_FLAGS ?= "" RUN_TESTS ?= false LLM_MODEL_URL ?= "https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf" EMBEDDING_MODEL_URL ?= "https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf" @@ -21,29 +22,34 @@ build-lib: ifeq ($(OS),Windows_NT) @powershell -Command "cmake -S ./third-party -B ./build_deps/third-party -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER=cl -DCMAKE_C_COMPILER=cl -DCMAKE_BUILD_TYPE=Release -GNinja;" @powershell -Command "cmake --build ./build_deps/third-party --config Release -j4;" - @powershell -Command "mkdir -p build; cd build; cmake .. $(CMAKE_EXTRA_FLAGS); cmake --build . --config Release;" + @powershell -Command "mkdir -p build; cd build; cmake .. $(CMAKE_SERVER_FLAGS); cmake --build . --config Release;" + @powershell -Command "cd build; cp bin/llama-server.exe bin/llama-server-cp.exe; rm -r CMakeFiles; rm -r CMakeCache.txt; cmake .. $(CMAKE_LIB_FLAGS); cmake --build . --config Release;" else ifeq ($(shell uname -s),Linux) @cmake -S ./third-party -B ./build_deps/third-party; @make -C ./build_deps/third-party -j4; @rm -rf ./build_deps/third-party; @mkdir build && cd build; \ - cmake .. $(CMAKE_EXTRA_FLAGS); \ + cmake .. $(CMAKE_SERVER_FLAGS); \ + cmake --build . --config Release --parallel 4; \ + cp bin/llama-server bin/llama-server-cp; \ + rm -rf CMakeFiles CMakeCache.txt; \ + cmake .. $(CMAKE_LIB_FLAGS); \ cmake --build . --config Release --parallel 4; else @cmake -S ./third-party -B ./build_deps/third-party @make -C ./build_deps/third-party -j4 @rm -rf ./build_deps/third-party @mkdir build && cd build; \ - cmake .. $(CMAKE_EXTRA_FLAGS); \ + cmake .. $(CMAKE_SERVER_FLAGS); \ make -j4; endif build-example-server: build-lib ifeq ($(OS),Windows_NT) - @powershell -Command "mkdir -p .\examples\server\build; cd .\examples\server\build; cmake .. $(CMAKE_EXTRA_FLAGS); cmake --build . --config Release;" + @powershell -Command "mkdir -p .\examples\server\build; cd .\examples\server\build; cmake .. $(CMAKE_SERVER_FLAGS); cmake --build . 
--config Release;" else ifeq ($(shell uname -s),Linux) @mkdir -p examples/server/build && cd examples/server/build; \ - cmake .. $(CMAKE_EXTRA_FLAGS); \ + cmake .. $(CMAKE_SERVER_FLAGS); \ cmake --build . --config Release; else @mkdir -p examples/server/build && cd examples/server/build; \ @@ -54,15 +60,18 @@ endif pre-package: ifeq ($(OS),Windows_NT) @powershell -Command "mkdir -p cortex.llamacpp; cp build\engine.dll cortex.llamacpp\;" + @powershell -Command "cp build\bin\llama-server-cp.exe cortex.llamacpp\llama-server.exe;" @powershell -Command "cp .\.github\patches\windows\msvcp140.dll cortex.llamacpp\;" @powershell -Command "cp .\.github\patches\windows\vcruntime140_1.dll cortex.llamacpp\;" @powershell -Command "cp .\.github\patches\windows\vcruntime140.dll cortex.llamacpp\;" @powershell -Command "cp .\.github\patches\windows\vcomp140.dll cortex.llamacpp\;" else ifeq ($(shell uname -s),Linux) @mkdir -p cortex.llamacpp; \ + cp build/bin/llama-server-cp cortex.llamacpp/llama-server; \ cp build/libengine.so cortex.llamacpp/; else @mkdir -p cortex.llamacpp; \ + cp build/bin/llama-server cortex.llamacpp/; \ cp build/libengine.dylib cortex.llamacpp/; endif @@ -97,16 +106,18 @@ ifeq ($(RUN_TESTS),false) @exit 0 endif ifeq ($(OS),Windows_NT) - @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" + @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; ..\..\..\.github\scripts\e2e-test-server-windows.bat server.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" else ifeq ($(shell uname -s),Linux) @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.so engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); else @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ chmod +x ../../../.github/scripts/e2e-test-server-linux-and-mac.sh && ../../../.github/scripts/e2e-test-server-linux-and-mac.sh ./server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); endif @@ -118,13 +129,14 @@ endif ifeq ($(OS),Windows_NT) @powershell -Command "python -m pip install --upgrade pip" @powershell -Command "python -m pip install requests;" - @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" + @powershell -Command "mkdir -p examples\server\build\engines\cortex.llamacpp; cd examples\server\build; cp ..\..\..\build\engine.dll engines\cortex.llamacpp; cp ..\..\..\build\bin\llama-server.exe engines\cortex.llamacpp; python ..\..\..\.github\scripts\e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);" else ifeq ($(shell uname -s),Linux) python -m pip install --upgrade pip; python -m pip install requests; @mkdir -p 
examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.so engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ python ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); else python -m pip install --upgrade pip; @@ -132,6 +144,7 @@ else @mkdir -p examples/server/build/engines/cortex.llamacpp; \ cd examples/server/build/; \ cp ../../../build/libengine.dylib engines/cortex.llamacpp/; \ + cp ../../../build/bin/llama-server engines/cortex.llamacpp/; \ python ../../../.github/scripts/e2e-test-server.py server $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); endif diff --git a/base/cortex-common/enginei.h b/base/cortex-common/enginei.h index 200808e..c1009a1 100644 --- a/base/cortex-common/enginei.h +++ b/base/cortex-common/enginei.h @@ -6,6 +6,7 @@ #include #include "json/value.h" +#include "trantor/utils/AsyncFileLogger.h" #include "trantor/utils/Logger.h" // Interface for inference engine. diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index eac6112..1c743b3 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -15,6 +15,7 @@ add_executable(${PROJECT_NAME} server.cc dylib.h httplib.h + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/file_logger.cc ) set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../build_deps/_install) diff --git a/examples/server/README.md b/examples/server/README.md new file mode 100644 index 0000000..b4106ad --- /dev/null +++ b/examples/server/README.md @@ -0,0 +1,97 @@ +This application allows you to perform inference with various AI models, including vision, text, and embedding models. It employs different strategies for serving these models efficiently, tailored to their specific requirements. + +- Vision Models: +For vision models, the application utilizes a dedicated, customized server that runs within the same process as the main application. This approach ensures efficient resource utilization and minimizes overhead, providing seamless inference for vision-based tasks. + +- Text and Embedding Models: +To handle text and embedding models, the application spawns a separate child process for each model. This isolation technique prevents potential resource contention and ensures optimal performance, as each model operates independently within its dedicated process environment. + +By adopting distinct serving strategies based on model types, the application optimizes resource allocation, maximizes performance, and ensures reliable and efficient inference across a diverse range of AI models. + +**API Endpoints** + +The application provides the following API endpoints for interacting with the models: + +**Load Model** + +Endpoint: /loadmodel + +Method: curl -X POST + +Description: Loads a specified model into the application. For text and embedding models, this endpoint will spawn a new child process to serve the model. + +```bash title="Load model" +curl --location 'http://localhost:3928/loadmodel' \ +--header 'Content-Type: application/json' \ +--data '{ + "llama_model_path": "/model/llama-2-7b-model.gguf", + "model_alias": "llama-2-7b-model", + "ctx_len": 512, + "ngl": 100, + "model_type": "llm" + }' +``` + +**Chat Completion** + +Endpoint: /v1/chat/completions + +Method: curl -X POST + +Description: Performs chat completion using a loaded text model. 
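+
+The endpoint also supports streaming: set "stream": true in the request body to receive the reply as OpenAI-style server-sent events, and add "stream_options": {"include_usage": true} if a final usage chunk is needed. The request below is only an illustrative sketch reusing the model alias from the load example; the standard non-streaming form follows.
+
+```bash title="Inference (streaming)"
+curl --location 'http://localhost:3928/v1/chat/completions' \
+--header 'Content-Type: application/json' \
+--data '{
+    "messages": [
+      {
+        "role": "user",
+        "content": "Tell me a short story"
+      }
+    ],
+    "model": "llama-2-7b-model",
+    "stream": true,
+    "stream_options": {"include_usage": true}
+  }'
+```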
+ +```bash title="Inference" +curl --location 'http://localhost:3928/v1/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "messages": [ + { + "role": "user", + "content": "Who won the world series in 2020?" + } + ], + "model": "llama-2-7b-model" + }' +``` + +**Embedding** + +Endpoint: /v1/embeddings + +Method: curl -X POST + +Description: Requests an embedding using a loaded embedding model. +```bash title="Embeddings" +curl --location '127.0.0.1:3928/v1/embeddings' \ +--header 'Content-Type: application/json' \ +--data '{ + "input": ["hello", "The food was delicious and the waiter..."], + "model":"llama-2-7b-model", + "encoding_format": "base64" +}' +``` + +**Unload Model** + +Endpoint: /unloadmodel + +Method: curl -X POST + +Description: Unloads a specified model from the application. For text and embedding models, this endpoint will also stop the associated child process. +```bash title="Unload Model" +curl --location '127.0.0.1:3928/unloadmodel' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "test" +}' +``` + +**Multiple Models** + +This application is designed to handle multiple AI models concurrently, ensuring efficient resource utilization and optimal performance. The serving strategy employed varies based on the model type: +- Vision models: multiple instances can run within the same process. +- Text and embedding models: each model will have its own child process. + +**Notes** + +For vision models, a customized server is started within the same process to serve the model. No new process is needed. +For text and embedding models, a new child process is spawned to serve each model. \ No newline at end of file diff --git a/examples/server/dylib.h b/examples/server/dylib.h index 806e2ee..9b7502a 100644 --- a/examples/server/dylib.h +++ b/examples/server/dylib.h @@ -58,6 +58,7 @@ * The `dylib` class represents a single dynamic library instance, * allowing the access of symbols like functions or global variables */ +namespace cortex_cpp { class dylib { public: struct filename_components { @@ -311,6 +312,7 @@ class dylib { } }; +} #undef DYLIB_WIN_MAC_OTHER #undef DYLIB_WIN_OTHER #undef DYLIB_CPP17 \ No newline at end of file diff --git a/examples/server/server.cc b/examples/server/server.cc index ef52d67..a736396 100644 --- a/examples/server/server.cc +++ b/examples/server/server.cc @@ -8,14 +8,12 @@ #include #include #include -#include "trantor/utils/Logger.h" +#include "../../src/file_logger.h" +#include "../../src/llama_utils.h" + class Server { public: - Server() { - dylib_ = std::make_unique("./engines/cortex.llamacpp", "engine"); - auto func = dylib_->get_function("get_engine"); - engine_ = func(); - } + Server() {} ~Server() { if (engine_) { @@ -23,6 +21,19 @@ class Server { } } + void Initialize(trantor::AsyncFileLogger* logger) { + dylib_ = std::make_unique("./engines/cortex.llamacpp", + "engine"); + auto func = dylib_->get_function("get_engine"); + engine_ = func(); + EngineI::EngineLoadOption opts; + opts.engine_path = llama_utils::GetExecutableFolderContainerPath() / + "engines" / "cortex.llamacpp"; + opts.log_path = "./logs/cortex.log"; + opts.max_log_lines = 10000; + engine_->Load(opts); + } + void ForceStopInferencing(const std::string& model_id) { if (engine_) { engine_->StopInferencing(model_id); @@ -32,7 +43,7 @@ class Server { } public: - std::unique_ptr dylib_; + std::unique_ptr dylib_; EngineI* engine_; struct SyncQueue { @@ -86,16 +97,16 @@ inline void signal_handler(int signal) { using SyncQueue = Server::SyncQueue;
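// main() creates ./logs, routes trantor log output into the file logger, loads the
// engine through Server::Initialize(), and then serves the HTTP API
// (127.0.0.1:3928 by default).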
int main(int argc, char** argv) { - // std::filesystem::create_directories("./logs"); - // trantor::AsyncFileLogger asyncFileLogger; - // asyncFileLogger.setFileName("logs/cortex"); - // asyncFileLogger.startLogging(); - // trantor::Logger::setOutputFunction( - // [&](const char* msg, const uint64_t len) { - // asyncFileLogger.output(msg, len); - // }, - // [&]() { asyncFileLogger.flush(); }); - // asyncFileLogger.setFileSizeLimit(100000000); + std::filesystem::create_directories("./logs"); + trantor::FileLogger async_file_logger; + async_file_logger.setFileName("logs/cortex.log"); + async_file_logger.startLogging(); + trantor::Logger::setOutputFunction( + [&](const char* msg, const uint64_t len) { + async_file_logger.output_(msg, len); + }, + [&]() { async_file_logger.flush(); }); + async_file_logger.setFileSizeLimit(100000000); std::string hostname = "127.0.0.1"; int port = 3928; @@ -109,8 +120,9 @@ int main(int argc, char** argv) { } Server server; + + server.Initialize(&async_file_logger); //set logger here - // server.engine_->SetFileLogger(); SyncJsonReader r; auto svr = std::make_unique(); @@ -277,7 +289,8 @@ int main(int argc, char** argv) { }); shutdown_handler = [&](int) { - running = false; + // only shutdown by /destroy or sent SIGINT twice + // running = false; }; #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) struct sigaction sigint_action; diff --git a/patches/0002-Build-llama-cpp-examples.patch b/patches/0002-Build-llama-cpp-examples.patch new file mode 100644 index 0000000..7096bb0 --- /dev/null +++ b/patches/0002-Build-llama-cpp-examples.patch @@ -0,0 +1,117 @@ +diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt +index 21b31392..18049f0b 100644 +--- a/examples/CMakeLists.txt ++++ b/examples/CMakeLists.txt +@@ -61,7 +61,7 @@ else() + # disabled on Windows because it uses internal functions not exported with LLAMA_API + add_subdirectory(quantize-stats) + endif() +- add_subdirectory(llava) ++ # add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() +diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt +index 1b7cc8c1..269d8397 100644 +--- a/examples/server/CMakeLists.txt ++++ b/examples/server/CMakeLists.txt +@@ -34,7 +34,7 @@ endforeach() + add_executable(${TARGET} ${TARGET_SRCS}) + install(TARGETS ${TARGET} RUNTIME) + +-target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) ++target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}/llama.cpp) + target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) + + if (LLAMA_SERVER_SSL) +diff --git a/examples/server/server.cpp b/examples/server/server.cpp +index e0acc470..98d7ab3a 100644 +--- a/examples/server/server.cpp ++++ b/examples/server/server.cpp +@@ -3343,6 +3343,8 @@ struct server_context { + {"n_embd", llama_model_n_embd (model)}, + {"n_params", llama_model_n_params (model)}, + {"size", llama_model_size (model)}, ++ {"vram", llama_get_other_buffer (model)}, ++ {"ram", llama_get_cpu_buffer (model)}, + }; + } + }; +diff --git a/include/llama.h b/include/llama.h +index 298b8d1b..0011dd8e 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -468,6 +468,8 @@ extern "C" { + DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); + + LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); ++ LLAMA_API size_t llama_get_cpu_buffer(const struct llama_model * model); ++ LLAMA_API size_t llama_get_other_buffer(const struct 
llama_model * model); + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); +diff --git a/src/llama-context.cpp b/src/llama-context.cpp +index 671d2a81..2d802349 100644 +--- a/src/llama-context.cpp ++++ b/src/llama-context.cpp +@@ -606,6 +606,14 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) { + return &ctx->model; + } + ++size_t llama_get_cpu_buffer(const struct llama_model * model) { ++ return model->llama_get_cpu_buffer(); ++} ++ ++size_t llama_get_other_buffer(const struct llama_model * model) { ++ return model->llama_get_other_buffer(); ++} ++ + enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { + return ctx->cparams.pooling_type; + } +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +index 590386e6..e7ead0fb 100644 +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -3750,6 +3750,26 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const { + return it->second; + } + ++size_t llama_model::llama_get_cpu_buffer() const { ++ size_t buffer{0}; ++ for (const auto& buf : pimpl->bufs) { ++ if (strcmp(ggml_backend_buffer_name(buf.get()), "CPU") == 0) { ++ buffer += ggml_backend_buffer_get_size(buf.get()); ++ } ++ } ++ return buffer; ++} ++ ++size_t llama_model::llama_get_other_buffer() const { ++ size_t buffer{0}; ++ for (const auto& buf : pimpl->bufs) { ++ if (strcmp(ggml_backend_buffer_name(buf.get()), "CPU") != 0) { ++ buffer += ggml_backend_buffer_get_size(buf.get()); ++ } ++ } ++ return buffer; ++} ++ + // + // interface implementation + // +diff --git a/src/llama-model.h b/src/llama-model.h +index a7c30444..e04233ad 100644 +--- a/src/llama-model.h ++++ b/src/llama-model.h +@@ -362,6 +362,10 @@ struct llama_model { + + const struct ggml_tensor * get_tensor(const char * name) const; + ++ size_t llama_get_cpu_buffer() const; ++ ++ size_t llama_get_other_buffer() const; ++ + private: + struct impl; + std::unique_ptr pimpl; diff --git a/src/llama_data.h b/src/llama_data.h new file mode 100644 index 0000000..7753cd2 --- /dev/null +++ b/src/llama_data.h @@ -0,0 +1,60 @@ +#pragma once +#include "json/json.h" + +struct IsDone { + bool is_done; + int operator()() { return is_done; } +}; + +struct HasError { + bool has_error; + int operator()() { return has_error; } +}; + +struct IsStream { + bool is_stream; + int operator()() { return is_stream; } +}; + +struct StatusCode { + int status_code; + int operator()() { return status_code; } +}; + +struct ResStatus { + private: + IsDone is_done; + HasError has_error; + IsStream is_stream; + StatusCode status_code; + + public: + ResStatus(IsDone is_done, HasError has_error, IsStream is_stream, + StatusCode status_code) + : is_done(is_done), + has_error(has_error), + is_stream(is_stream), + status_code(status_code) {} + + Json::Value ToJson() { + Json::Value status; + status["is_done"] = is_done(); + status["has_error"] = has_error(); + status["is_stream"] = is_stream(); + status["status_code"] = status_code(); + return status; + }; +}; + +struct ResStreamData { + private: + std::string s; + + public: + ResStreamData(std::string s) : s(std::move(s)) {} + Json::Value ToJson() { + Json::Value d; + d["data"] = s; + return d; + } +}; \ No newline at end of file diff --git a/src/llama_engine.cc b/src/llama_engine.cc index e382404..798a115 100644 --- a/src/llama_engine.cc +++ b/src/llama_engine.cc @@ -1,9 +1,13 @@ +// clang-format off 
+#include "examples/server/httplib.h" +// clang-format on #include "llama_engine.h" #include #include #include #include #include "json-schema-to-grammar.h" +#include "json/json.h" #include "json/writer.h" #include "llama_utils.h" #include "trantor/utils/Logger.h" @@ -16,6 +20,31 @@ namespace { +const std::unordered_set kIgnoredParams = { + "model", "model_alias", "embedding", "ai_prompt", + "ai_template", "prompt_template", "mmproj", "system_prompt", + "created", "stream", "name", "os", + "owned_by", "files", "gpu_arch", "quantization_method", + "engine", "system_template", "max_tokens", "user_template", + "user_prompt", "min_keep", "mirostat", "mirostat_eta", + "mirostat_tau", "text_model", "version", "n_probs", + "object", "penalize_nl", "precision", "size", + "stop", "tfs_z", "typ_p"}; + +const std::unordered_map kParamsMap = { + {"cpu_threads", "--threads"}, + {"n_ubatch", "--ubatch-size"}, + {"n_batch", "--batch-size"}, + {"n_parallel", "--parallel"}, + {"temperature", "--temp"}, + {"top_k", "--top-k"}, + {"top_p", "--top-p"}, + {"min_p", "--min-p"}, + {"dynatemp_exponent", "--dynatemp-exp"}, + {"ctx_len", "--ctx-size"}, + {"ngl", "-ngl"}, +}; + constexpr const int k200OK = 200; constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; @@ -284,13 +313,56 @@ ggml_type kv_cache_type_from_str(const std::string& s) { throw std::runtime_error("Unsupported cache type: " + s); } +nlohmann::json ConvertJsonCppToNlohmann(const Json::Value& json_cpp_value) { + // Base cases + if (json_cpp_value.isNull()) { + return nullptr; + } else if (json_cpp_value.isBool()) { + return json_cpp_value.asBool(); + } else if (json_cpp_value.isInt()) { + return json_cpp_value.asInt(); + } else if (json_cpp_value.isUInt()) { + return json_cpp_value.asUInt(); + } else if (json_cpp_value.isDouble()) { + return json_cpp_value.asDouble(); + } else if (json_cpp_value.isString()) { + return json_cpp_value.asString(); + } + + // Recursive cases + if (json_cpp_value.isArray()) { + nlohmann::json json_array = nlohmann::json::array(); + for (const auto& element : json_cpp_value) { + json_array.push_back(ConvertJsonCppToNlohmann(element)); + } + return json_array; + } else if (json_cpp_value.isObject()) { + nlohmann::json json_object = nlohmann::json::object(); + for (const auto& member : json_cpp_value.getMemberNames()) { + json_object[member] = ConvertJsonCppToNlohmann(json_cpp_value[member]); + } + return json_object; + } + + // Should never reach here + throw std::runtime_error("Unsupported JSON value type"); +} + +Json::Value ParseJsonString(const std::string& json_str) { + Json::Value root; + Json::Reader reader; + reader.parse(json_str, root); + return root; +} + } // namespace void LlamaEngine::Load(EngineLoadOption opts) { + load_opt_ = opts; LOG_DEBUG << "Loading engine.."; - LOG_DEBUG << "Is custom engine path: " << opts.is_custom_engine_path; LOG_DEBUG << "Engine path: " << opts.engine_path.string(); + LOG_DEBUG << "Log path: " << opts.log_path.string(); SetFileLogger(opts.max_log_lines, opts.log_path.string()); SetLogLevel(opts.log_level); @@ -335,34 +407,47 @@ LlamaEngine::~LlamaEngine() { server_map_.clear(); async_file_logger_.reset(); +#if defined(__linux__) || defined(__APPLE__) + for (auto const& [_, si] : llama_server_map_) { + kill(si.pid, SIGTERM); + } + llama_server_map_.clear(); +#endif + LOG_INFO << "LlamaEngine destructed successfully"; } -void LlamaEngine::HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) { +void 
LlamaEngine::HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) { // Check if model is loaded - if (CheckModelLoaded(callback, llama_utils::GetModelId(*json_body))) { - // Model is loaded - // Do Inference + auto model = llama_utils::GetModelId(*json_body); + if (!CheckModelLoaded(callback, model)) + return; + + if (IsLlamaServerModel(model)) { + HandleLlamaCppChatCompletion(json_body, std::move(callback), model); + } else { HandleInferenceImpl(llama::inferences::fromJson(json_body), std::move(callback)); } } -void LlamaEngine::HandleEmbedding( - std::shared_ptr json_body, - std::function&& callback) { +void LlamaEngine::HandleEmbedding(std::shared_ptr json_body, + http_callback&& callback) { // Check if model is loaded - if (CheckModelLoaded(callback, llama_utils::GetModelId(*json_body))) { - // Run embedding + auto model = llama_utils::GetModelId(*json_body); + if (!CheckModelLoaded(callback, model)) + return; + + if (IsLlamaServerModel(model)) { + HandleLlamaCppEmbedding(json_body, std::move(callback), model); + } else { HandleEmbeddingImpl(json_body, std::move(callback)); } } -void LlamaEngine::LoadModel( - std::shared_ptr json_body, - std::function&& callback) { +void LlamaEngine::LoadModel(std::shared_ptr json_body, + http_callback&& callback) { if (std::exchange(print_version_, false)) { #if defined(CORTEXLLAMA_VERSION) LOG_INFO << "cortex.llamacpp version: " << CORTEXLLAMA_VERSION; @@ -373,101 +458,121 @@ void LlamaEngine::LoadModel( auto model_id = llama_utils::GetModelId(*json_body); if (model_id.empty()) { LOG_INFO << "Model id is empty in request"; - Json::Value jsonResp; - jsonResp["message"] = "No model id found in request body"; - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["message"] = "No model id found in request body"; + callback(ResStatus(IsDone{false}, HasError{true}, IsStream{false}, + StatusCode{k400BadRequest}) + .ToJson(), + std::move(json_resp)); return; } if (auto si = server_map_.find(model_id); - si != server_map_.end() && si->second.ctx.model_loaded_external) { + (si != server_map_.end() && si->second.ctx.model_loaded_external) || + IsLlamaServerModel(model_id)) { LOG_INFO << "Model already loaded"; - Json::Value jsonResp; - jsonResp["message"] = "Model already loaded"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k409Conflict; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["message"] = "Model already loaded"; + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k409Conflict}) + .ToJson(), + std::move(json_resp)); return; } if (!LoadModelImpl(json_body)) { // Error occurred during model loading - Json::Value jsonResp; - jsonResp["message"] = "Failed to load model"; - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k500InternalServerError; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["message"] = "Failed to load model"; + callback(ResStatus(IsDone{false}, HasError{true}, IsStream{false}, + StatusCode{k500InternalServerError}) + .ToJson(), + std::move(json_resp)); } else { // Model loaded successfully - Json::Value jsonResp; - jsonResp["message"] = 
"Model loaded successfully"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["message"] = "Model loaded successfully"; + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(json_resp)); LOG_INFO << "Model loaded successfully: " << model_id; } } -void LlamaEngine::UnloadModel( - std::shared_ptr json_body, - std::function&& callback) { +void LlamaEngine::UnloadModel(std::shared_ptr json_body, + http_callback&& callback) { auto model_id = llama_utils::GetModelId(*json_body); + + if (IsLlamaServerModel(model_id)) { + bool sent = false; +#if defined(_WIN32) || defined(_WIN64) + sent = GenerateConsoleCtrlEvent(CTRL_C_EVENT, + llama_server_map_[model_id].pi.dwProcessId); +#else + sent = (kill(llama_server_map_[model_id].pid, SIGTERM) != -1); +#endif + if (sent) { + LOG_INFO << "SIGINT signal sent to child process"; + Json::Value json_resp; + json_resp["message"] = "Model unloaded successfully"; + + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(json_resp)); + llama_server_map_.erase(model_id); + } else { + LOG_ERROR << "Failed to send SIGINT signal to child process"; + } + return; + } + if (CheckModelLoaded(callback, model_id)) { auto& l = server_map_[model_id].ctx; l.ReleaseResources(); - Json::Value jsonResp; - jsonResp["message"] = "Model unloaded successfully"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["message"] = "Model unloaded successfully"; + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(json_resp)); server_map_.erase(model_id); LOG_INFO << "Model unloaded successfully"; } } -void LlamaEngine::GetModelStatus( - std::shared_ptr json_body, - std::function&& callback) { +void LlamaEngine::GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) { auto model_id = llama_utils::GetModelId(*json_body); if (auto is_loaded = CheckModelLoaded(callback, model_id); is_loaded) { + if (IsLlamaServerModel(model_id)) { + Json::Value json_resp; + json_resp["model_loaded"] = is_loaded; + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(json_resp)); + return; + } // CheckModelLoaded gurantees that model_id exists in server_ctx_map; auto si = server_map_.find(model_id); - Json::Value jsonResp; - jsonResp["model_loaded"] = is_loaded; - jsonResp["model_data"] = si->second.ctx.GetModelProps().dump(); - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["model_loaded"] = is_loaded; + json_resp["model_data"] = si->second.ctx.GetModelProps().dump(); + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(json_resp)); LOG_INFO << "Model status responded"; } } -void LlamaEngine::GetModels( - std::shared_ptr json_body, - std::function&& callback) { +void LlamaEngine::GetModels(std::shared_ptr json_body, + http_callback&& callback) { 
Json::Value json_resp; Json::Value model_array(Json::arrayValue); for (const auto& [m, s] : server_map_) { @@ -487,15 +592,25 @@ void LlamaEngine::GetModels( } } + for (const auto& [m, s] : llama_server_map_) { + Json::Value val; + val["id"] = m; + val["engine"] = "cortex.llamacpp"; + val["start_time"] = s.start_time; + val["model_size"] = s.model_size; + val["vram"] = s.vram; + val["ram"] = s.ram; + val["object"] = "model"; + model_array.append(val); + } + json_resp["object"] = "list"; json_resp["data"] = model_array; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(json_resp)); + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(json_resp)); LOG_INFO << "Running models responded"; } @@ -540,11 +655,22 @@ void LlamaEngine::SetFileLogger(int max_log_lines, } }, nullptr); - freopen(log_path.c_str(), "a", stderr); - freopen(log_path.c_str(), "a", stdout); + if (!freopen(log_path.c_str(), "a", stderr)) + LOG_WARN << "Could not open stream for stderr"; + if (!freopen(log_path.c_str(), "a", stdout)) + LOG_WARN << "Could not open stream for stdout"; } bool LlamaEngine::LoadModelImpl(std::shared_ptr json_body) { + if (!json_body) { + LOG_ERROR << "Request body is empty!"; + return false; + } + + // Spawn llama.cpp server only if it is chat model + if (!json_body->isMember("mmproj") || (*json_body)["mmproj"].isNull()) { + return SpawnLlamaServer(*json_body); + } common_params params; std::string model_type; auto model_id = llama_utils::GetModelId(*json_body); @@ -634,21 +760,21 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr json_body) { params.cache_type_k = kv_cache_type_from_str(cache_type_k); } params.cache_type_v = params.cache_type_k; - LOG_DEBUG << "cache_type: " << params.cache_type_k; + LOG_INFO << "cache_type: " << params.cache_type_k; auto fa = json_body->get("flash_attn", true).asBool(); auto force_enable_fa = params.cache_type_k != GGML_TYPE_F16; if (force_enable_fa) { - LOG_DEBUG << "Using KV cache quantization, force enable Flash Attention"; + LOG_INFO << "Using KV cache quantization, force enable Flash Attention"; } params.flash_attn = fa || force_enable_fa; if (params.flash_attn) { - LOG_DEBUG << "Enabled Flash Attention"; + LOG_INFO << "Enabled Flash Attention"; } params.use_mmap = json_body->get("use_mmap", true).asBool(); if (!params.use_mmap) { - LOG_DEBUG << "Disabled mmap"; + LOG_INFO << "Disabled mmap"; } params.n_predict = json_body->get("n_predict", -1).asInt(); params.prompt = json_body->get("prompt", "").asString(); @@ -670,7 +796,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr json_body) { server_map_[model_id].repeat_last_n = json_body->get("repeat_last_n", 32).asInt(); server_map_[model_id].stop_words = (*json_body)["stop"]; - LOG_DEBUG << "stop: " << server_map_[model_id].stop_words.toStyledString(); + LOG_INFO << "stop: " << server_map_[model_id].stop_words.toStyledString(); if (!json_body->operator[]("llama_log_folder").isNull()) { common_log_resume(common_log_main()); @@ -733,19 +859,17 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr json_body) { void LlamaEngine::HandleInferenceImpl( llama::inferences::ChatCompletionRequest&& completion, - std::function&& callback) { + http_callback&& callback) { assert(server_map_.find(completion.model_id) != server_map_.end()); auto& si = server_map_[completion.model_id]; if (si.ctx.model_type == ModelType::kEmbedding) 
{ LOG_WARN << "Not support completion for embedding model"; - Json::Value jsonResp; - jsonResp["message"] = "Not support completion for embedding model"; - Json::Value status; - status["is_done"] = true; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k400BadRequest; - callback(std::move(status), std::move(jsonResp)); + Json::Value json_resp; + json_resp["message"] = "Not support completion for embedding model"; + callback(ResStatus(IsDone{true}, HasError{true}, IsStream{false}, + StatusCode{k400BadRequest}) + .ToJson(), + std::move(json_resp)); return; } auto formatted_output = si.pre_prompt; @@ -755,7 +879,7 @@ void LlamaEngine::HandleInferenceImpl( << "Generating response for inference request"; json data; - json stopWords; + json stop_words; int no_images = 0; // To set default value @@ -920,13 +1044,13 @@ void LlamaEngine::HandleInferenceImpl( data["prompt"] = formatted_output; for (const auto& sw : stop_words_json) { - stopWords.push_back(sw.asString()); + stop_words.push_back(sw.asString()); } // specify default stop words // Ensure success case for chatML - stopWords.push_back("<|im_end|>"); - stopWords.push_back(llama_utils::rtrim(si.user_prompt)); - data["stop"] = stopWords; + stop_words.push_back("<|im_end|>"); + stop_words.push_back(llama_utils::rtrim(si.user_prompt)); + data["stop"] = stop_words; bool is_streamed = data["stream"]; bool include_usage = completion.include_usage; @@ -972,14 +1096,10 @@ void LlamaEngine::HandleInferenceImpl( to_send, "", include_usage, std::nullopt, logprobs) + "\n\n"; - Json::Value respData; - respData["data"] = str; - Json::Value status; - status["is_done"] = false; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = k200OK; - cb(std::move(status), std::move(respData)); + cb(ResStatus(IsDone{false}, HasError{false}, IsStream{true}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData(str).ToJson()); if (result.stop) { LOG_INFO << "Request " << request_id << ": " << "End of result"; @@ -989,7 +1109,6 @@ void LlamaEngine::HandleInferenceImpl( // The usage field on this chunk shows the token usage statistics for the entire request, // and the choices field will always be an empty array. // All other chunks will also include a usage field, but with a null value. 
- Json::Value respData; std::optional u; if (include_usage) { u = Usage{result.result_json["tokens_evaluated"], @@ -1000,13 +1119,10 @@ void LlamaEngine::HandleInferenceImpl( CreateReturnJson(llama_utils::generate_random_string(20), "_", "", "stop", include_usage, u) + "\n\n" + "data: [DONE]" + "\n\n"; - respData["data"] = str; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = true; - status["status_code"] = k200OK; - cb(std::move(status), std::move(respData)); + cb(ResStatus(IsDone{true}, HasError{false}, IsStream{true}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData(str).ToJson()); break; } @@ -1014,14 +1130,11 @@ void LlamaEngine::HandleInferenceImpl( state->llama.RequestCancel(state->task_id); LOG_ERROR << "Request " << request_id << ": " << "Error during inference"; - Json::Value respData; - respData["data"] = std::string(); - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = true; - status["status_code"] = k200OK; - cb(std::move(status), std::move(respData)); + + cb(ResStatus(IsDone{false}, HasError{true}, IsStream{true}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData("").ToJson()); break; } } @@ -1030,14 +1143,10 @@ void LlamaEngine::HandleInferenceImpl( // Request completed, release it if (!state->llama.model_loaded_external) { LOG_WARN << "Model unloaded during inference"; - Json::Value respData; - respData["data"] = std::string(); - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = true; - status["status_code"] = k200OK; - cb(std::move(status), std::move(respData)); + cb(ResStatus(IsDone{false}, HasError{true}, IsStream{true}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData("").ToJson()); } LOG_INFO << "Request " << request_id << ": " << "Inference completed"; }); @@ -1047,7 +1156,7 @@ void LlamaEngine::HandleInferenceImpl( si.q->runTaskInQueue([this, n, n_probs, request_id, state, cb = std::move(callback), d = std::move(data)]() { - Json::Value respData; + Json::Value resp_data; std::vector task_ids; for (int i = 0; i < n; i++) { task_ids.push_back(state->llama.RequestCompletion(d, false, false, -1)); @@ -1076,8 +1185,8 @@ void LlamaEngine::HandleInferenceImpl( if (n_probs > 0) { logprobs = result.result_json["completion_probabilities"]; } - if (respData.empty()) { - respData = CreateFullReturnJson( + if (resp_data.empty()) { + resp_data = CreateFullReturnJson( llama_utils::generate_random_string(20), "_", to_send, "_", prompt_tokens, predicted_tokens, Json::Value("stop"), logprobs); @@ -1087,34 +1196,31 @@ void LlamaEngine::HandleInferenceImpl( prompt_tokens, predicted_tokens, Json::Value("stop"), logprobs)["choices"][0]; choice["index"] = index; - respData["choices"].append(choice); + resp_data["choices"].append(choice); } index += 1; } else { bool has_error = true; - respData["message"] = "Internal error during inference"; + resp_data["message"] = "Internal error during inference"; LOG_ERROR << "Request " << request_id << ": " << "Error during inference"; break; } } - Json::Value status; - status["is_done"] = true; - status["has_error"] = has_error; - status["is_stream"] = false; - status["status_code"] = k200OK; - cb(std::move(status), std::move(respData)); + cb(ResStatus(IsDone{true}, HasError{has_error}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(resp_data)); LOG_INFO << "Request " << request_id << ": " << "Inference completed"; } }); } } -void LlamaEngine::HandleEmbeddingImpl( - 
std::shared_ptr json_body, - std::function&& callback) { +void LlamaEngine::HandleEmbeddingImpl(std::shared_ptr json_body, + http_callback&& callback) { auto model_id = llama_utils::GetModelId(*json_body); assert(server_map_.find(model_id) != server_map_.end()); int request_id = ++no_of_requests_; @@ -1211,34 +1317,30 @@ void LlamaEngine::HandleEmbeddingImpl( usage["prompt_tokens"] = prompt_tokens; usage["total_tokens"] = prompt_tokens; root["usage"] = usage; - Json::Value status; - status["is_done"] = true; - status["has_error"] = false; - status["is_stream"] = false; - status["status_code"] = k200OK; - callback(std::move(status), std::move(root)); + callback(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(root)); LOG_INFO << "Request " << request_id << ": " << "Embedding completed"; }); } -bool LlamaEngine::CheckModelLoaded( - std::function& callback, - const std::string& model_id) { +bool LlamaEngine::CheckModelLoaded(http_callback& callback, + const std::string& model_id) { if (auto si = server_map_.find(model_id); - si == server_map_.end() || !si->second.ctx.model_loaded_external) { + (si == server_map_.end() || !si->second.ctx.model_loaded_external) && + !IsLlamaServerModel(model_id)) { LOG_WARN << "Error: model_id: " << model_id << ", existed: " << (si != server_map_.end()) << ", loaded: " << false; - Json::Value jsonResp; - jsonResp["message"] = + Json::Value json_resp; + json_resp["message"] = "Model has not been loaded, please load model into cortex.llamacpp"; - Json::Value status; - status["is_done"] = false; - status["has_error"] = true; - status["is_stream"] = false; - status["status_code"] = k409Conflict; - callback(std::move(status), std::move(jsonResp)); + callback(ResStatus(IsDone{false}, HasError{true}, IsStream{false}, + StatusCode{k409Conflict}) + .ToJson(), + std::move(json_resp)); return false; } return true; @@ -1297,6 +1399,652 @@ bool LlamaEngine::HasForceStopInferenceModel(const std::string& id) const { force_stop_inference_models_.end(); } +bool LlamaEngine::SpawnLlamaServer(const Json::Value& json_params) { + auto wait_for_server_up = [](const std::string& host, int port) { + for (size_t i = 0; i < 10; i++) { + httplib::Client cli(host + ":" + std::to_string(port)); + auto res = cli.Get("/health"); + if (res && res->status == httplib::StatusCode::OK_200) { + return true; + } else { + LOG_INFO << "Wait for server up: " << i; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + return false; + }; + + LOG_DEBUG << "Start to spawn llama-server"; + auto model = llama_utils::GetModelId(json_params); + if (!model.empty()) { + llama_server_map_[model].host = "127.0.0.1"; + llama_server_map_[model].port = + llama_utils::GenerateRandomInteger(39400, 39999); + llama_server_map_[model].user_prompt = + json_params.get("user_prompt", "USER: ").asString(); + llama_server_map_[model].ai_prompt = + json_params.get("ai_prompt", "ASSISTANT: ").asString(); + llama_server_map_[model].system_prompt = + json_params.get("system_prompt", "ASSISTANT's RULE: ").asString(); + llama_server_map_[model].pre_prompt = + json_params.get("pre_prompt", "").asString(); + } else { + LOG_ERROR << "Model is empty"; + return false; + } + auto& s = llama_server_map_[model]; + auto n_parallel = json_params.get("n_parallel", 1).asInt(); + if (!s.q) + s.q = std::make_unique(n_parallel, model); +#if defined(_WIN32) || defined(_WIN64) + // Windows-specific code to create a new process + STARTUPINFO si; + + ZeroMemory(&si, sizeof(si)); + 
si.cb = sizeof(si); + ZeroMemory(&s.pi, sizeof(s.pi)); + std::string params = ConvertJsonToParams(json_params); + params += " --host " + s.host + " --port " + std::to_string(s.port); + + std::string exe_w = "llama-server.exe"; + std::string wcmds = + load_opt_.engine_path.string() + "/" + exe_w + " " + params; + LOG_INFO << "wcmds: " << wcmds; + LOG_INFO << "deps_path: " << load_opt_.deps_path.string(); + std::vector mutable_cmds(wcmds.begin(), wcmds.end()); + mutable_cmds.push_back(L'\0'); + // Create child process + if (!CreateProcess( + NULL, // No module name (use command line) + const_cast( + wcmds + .c_str()), // Command line (replace with your actual executable) + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance + 0, // No creation flags + NULL, // Use parent's environment block + const_cast(load_opt_.deps_path.string() + .c_str()), // Use parent's starting directory + &si, // Pointer to STARTUPINFO structure + &s.pi)) // Pointer to PROCESS_INFORMATION structure + { + std::cout << "Could not start server: " << GetLastError() << std::endl; + return false; + } else { + if (!wait_for_server_up(s.host, s.port)) { + llama_server_map_.erase(model); + return false; + } + std::cout << "Server started" << std::endl; + } +#else + // Unix-like system-specific code to fork a child process + s.pid = fork(); + + if (s.pid < 0) { + // Fork failed + std::cerr << "Could not start server: " << std::endl; + llama_server_map_.erase(model); + return false; + } else if (s.pid == 0) { + // Some engines requires to add lib search path before process being created + std::string exe = "llama-server"; + std::string p = (load_opt_.engine_path / exe).string(); + if (std::filesystem::exists(p)) { + try { + std::filesystem::permissions(p, + std::filesystem::perms::owner_exec | + std::filesystem::perms::group_exec | + std::filesystem::perms::others_exec, + std::filesystem::perm_options::add); + } catch (const std::filesystem::filesystem_error& e) { + LOG_WARN << "Error: " << e.what(); + } + } else { + LOG_ERROR << "llama-server does not exist"; + return false; + } + + std::vector params = ConvertJsonToParamsVector(json_params); + params.push_back("--host"); + params.push_back(s.host); + params.push_back("--port"); + params.push_back(std::to_string(s.port)); + auto convert_to_char_args = + [](const std::vector& args) -> std::vector { + std::vector char_args; + char_args.reserve(args.size() + + 1); // Reserve space for arguments and null terminator + + for (const auto& arg : args) { + char_args.push_back(const_cast(arg.c_str())); + } + + char_args.push_back(nullptr); // Add null terminator + + return char_args; + }; + std::vector v; + v.reserve(params.size() + 1); + v.push_back(exe); + v.insert(v.end(), params.begin(), params.end()); + auto exec_args = convert_to_char_args(v); + execv(p.c_str(), exec_args.data()); + } else { + // Parent process + if (!wait_for_server_up(s.host, s.port)) { + llama_server_map_.erase(model); + return false; + } + std::cout << "Server started" << std::endl; + } +#endif + s.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + + httplib::Client cli(s.host + ":" + std::to_string(s.port)); + auto res = cli.Get("/v1/models"); + if (res && res->status == httplib::StatusCode::OK_200) { + LOG_DEBUG << res->body; + auto b = ParseJsonString(res->body); + if (b.isMember("data") && b["data"].isArray() && b["data"].size() > 0) { + s.model_size = b["data"][0]["meta"].get("size", 
0).asUInt64(); + s.vram = b["data"][0]["meta"].get("vram", 0).asUInt64(); + s.ram = b["data"][0]["meta"].get("ram", 0).asUInt64(); + } + } + return true; +} + +std::string LlamaEngine::ConvertJsonToParams(const Json::Value& root) { + std::stringstream ss; + std::string errors; + + for (const auto& member : root.getMemberNames()) { + if (member == "model_path" || member == "llama_model_path") { + if (!root[member].isNull()) { + ss << "--model" << " "; + ss << "\"" << root[member].asString() << "\" "; + } + continue; + } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { + continue; + } else if (kParamsMap.find(member) != kParamsMap.end()) { + ss << kParamsMap.at(member) << " "; + ss << root[member].asString() << " "; + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + ss << "--embedding" << " "; + } + continue; + } + + ss << "--" << member << " "; + if (root[member].isString()) { + ss << "\"" << root[member].asString() << "\" "; + } else if (root[member].isInt()) { + ss << root[member].asInt() << " "; + } else if (root[member].isDouble()) { + ss << root[member].asDouble() << " "; + } else if (root[member].isArray()) { + ss << "["; + bool first = true; + for (const auto& value : root[member]) { + if (!first) { + ss << ", "; + } + ss << "\"" << value.asString() << "\""; + first = false; + } + ss << "] "; + } + } + + return ss.str(); +} + +std::vector LlamaEngine::ConvertJsonToParamsVector( + const Json::Value& root) { + std::vector res; + std::string errors; + + for (const auto& member : root.getMemberNames()) { + if (member == "model_path" || member == "llama_model_path") { + if (!root[member].isNull()) { + res.push_back("--model"); + res.push_back(root[member].asString()); + } + continue; + } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { + continue; + } else if (kParamsMap.find(member) != kParamsMap.end()) { + res.push_back(kParamsMap.at(member)); + res.push_back(root[member].asString()); + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + res.push_back("--embedding"); + } + continue; + } + + res.push_back("--" + member); + if (root[member].isString()) { + res.push_back(root[member].asString()); + } else if (root[member].isInt()) { + res.push_back(std::to_string(root[member].asInt())); + } else if (root[member].isDouble()) { + res.push_back(std::to_string(root[member].asDouble())); + } else if (root[member].isArray()) { + std::stringstream ss; + ss << "["; + bool first = true; + for (const auto& value : root[member]) { + if (!first) { + ss << ", "; + } + ss << "\"" << value.asString() << "\""; + first = false; + } + ss << "] "; + res.push_back(ss.str()); + } + } + + return res; +} + +bool LlamaEngine::HandleLlamaCppChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + if (IsLlamaServerModel(model)) { + llama_server_map_.at(model).q->runTaskInQueue( + [this, cb = std::move(callback), json_body, model] { + auto oaicompat = [&json_body]() -> bool { + if (json_body->isMember("logprobs") && + (*json_body)["logprobs"].asBool()) { + return false; + } + if (json_body->isMember("prompt") && + !(*json_body)["prompt"].asString().empty()) { + return false; + } + return true; + }(); + if (oaicompat) { + HandleOpenAiChatCompletion(json_body, + const_cast(cb), model); + } else { + HandleNonOpenAiChatCompletion( + json_body, const_cast(cb), model); + } + }); + LOG_DEBUG << "Done HandleChatCompletion"; + return true; + } + return 
false; +} + +void LlamaEngine::HandleOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& cb, + const std::string& model) { + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = llama_server_map_.at(model); + + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = + llama::inferences::ChatCompletionRequest::ConvertLogitBiasToArray( + (*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + + httplib::Client cli(s.host + ":" + std::to_string(s.port)); + auto data = ConvertJsonCppToNlohmann(*json_body); + + // llama.cpp server only supports n = 1 + data["n"] = 1; + auto data_str = data.dump(); + LOG_DEBUG << "data_str: " << data_str; + cli.set_read_timeout(std::chrono::seconds(60)); + if (is_stream) { + // std::cout << "> "; + httplib::Request req; + req.headers = httplib::Headers(); + req.set_header("Content-Type", "application/json"); + req.method = "POST"; + req.path = "/v1/chat/completions"; + req.body = data_str; + req.content_receiver = [cb, include_usage, n, is_stream]( + const char* data, size_t data_length, + uint64_t offset, uint64_t total_length) { + std::string s(data, data_length); + if (s.find("[DONE]") != std::string::npos) { + LOG_DEBUG << "[DONE]"; + cb(ResStatus(IsDone{true}, HasError{false}, IsStream{true}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData(s).ToJson()); + return false; + } + + // For openai api compatibility + if (!include_usage && s.find("completion_tokens") != std::string::npos) { + return true; + } + + cb(ResStatus(IsDone{false}, HasError{false}, IsStream{true}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData(s).ToJson()); + LOG_DEBUG << s; + return true; + }; + cli.send(req); + LOG_DEBUG << "Sent"; + } else { + Json::Value result; + // multiple choices + for (int i = 0; i < n; i++) { + auto res = cli.Post("/v1/chat/completions", httplib::Headers(), + data_str.data(), data_str.size(), "application/json"); + if (res) { + LOG_DEBUG << res->body; + auto r = ParseJsonString(res->body); + if (i == 0) { + result = r; + } else { + r["choices"][0]["index"] = i; + result["choices"].append(r["choices"][0]); + result["usage"]["completion_tokens"] = + result["usage"]["completion_tokens"].asInt() + + r["usage"]["completion_tokens"].asInt(); + result["usage"]["prompt_tokens"] = + result["usage"]["prompt_tokens"].asInt() + + r["usage"]["prompt_tokens"].asInt(); + result["usage"]["total_tokens"] = + result["usage"]["total_tokens"].asInt() + + r["usage"]["total_tokens"].asInt(); + } + + if (i == n - 1) { + cb(ResStatus(IsDone{true}, HasError{false}, IsStream{false}, + StatusCode{k200OK}) + .ToJson(), + std::move(result)); + } + + } else { + std::cout << "Error" << std::endl; + cb(ResStatus(IsDone{true}, HasError{true}, IsStream{false}, + StatusCode{k500InternalServerError}) + .ToJson(), + Json::Value()); + break; + } + } + } +} + +void LlamaEngine::HandleNonOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& cb, + const std::string& model) { + LOG_DEBUG << "Handle non OpenAI"; + LOG_DEBUG << json_body->toStyledString(); + auto is_stream = 
(*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = llama_server_map_.at(model); + + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = + llama::inferences::ChatCompletionRequest::ConvertLogitBiasToArray( + (*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + + httplib::Client cli(s.host + ":" + std::to_string(s.port)); + auto get_message = [](const Json::Value& msg_content) -> std::string { + if (msg_content.isArray()) { + for (const auto& mc : msg_content) { + if (mc["type"].asString() == "text") { + return mc["text"].asString(); + } + } + } else { + return msg_content.asString(); + } + return ""; + }; + + // If prompt is provided, use it as the prompt + if (!json_body->isMember("prompt") || + (*json_body)["prompt"].asString().empty()) { + auto formatted_output = s.pre_prompt; + for (const auto& message : (*json_body)["messages"]) { + auto input_role = message["role"].asString(); + std::string role; + if (input_role == "user") { + role = s.user_prompt; + } else if (input_role == "assistant") { + role = s.ai_prompt; + } else if (input_role == "system") { + role = s.system_prompt; + } else { + role = input_role; + } + + if (auto content = get_message(message["content"]); !content.empty()) { + formatted_output += role + content; + } + } + formatted_output += s.ai_prompt; + (*json_body)["prompt"] = formatted_output; + } + + auto data = ConvertJsonCppToNlohmann(*json_body); + + // llama.cpp server only supports n = 1 + data["n"] = 1; + auto data_str = data.dump(); + LOG_DEBUG << "data_str: " << data_str; + cli.set_read_timeout(std::chrono::seconds(60)); + int n_probs = json_body->get("n_probs", 0).asInt(); + if (is_stream) { + // std::cout << "> "; + httplib::Request req; + req.headers = httplib::Headers(); + req.set_header("Content-Type", "application/json"); + req.method = "POST"; + req.path = "/v1/completions"; + req.body = data_str; + req.content_receiver = [cb, include_usage, n, is_stream, n_probs, model]( + const char* data, size_t data_length, + uint64_t offset, uint64_t total_length) { + std::string s(data, data_length); + LOG_DEBUG << s; + if (s.size() > 6) { + s = s.substr(6); + } + auto json_data = ParseJsonString(s); + + // DONE + if (json_data.isMember("timings")) { + std::optional u; + if (include_usage) { + u = Usage{json_data["tokens_evaluated"].asInt(), + json_data["tokens_predicted"].asInt()}; + } + const std::string str = + "data: " + + CreateReturnJson(llama_utils::generate_random_string(20), model, "", + "stop", include_usage, u) + + "\n\n" + "data: [DONE]" + "\n\n"; + + cb(ResStatus(IsDone{true}, HasError{false}, IsStream{is_stream}, + StatusCode{k200OK}) + .ToJson(), + ResStreamData(str).ToJson()); + return false; + } + + json logprobs; + if (n_probs > 0) { + logprobs = + ConvertJsonCppToNlohmann(json_data["completion_probabilities"]); + } + std::string to_send; + if (json_data.isMember("choices") && json_data["choices"].isArray() && + json_data["choices"].size() > 0) { + to_send = json_data["choices"][0].get("text", "").asString(); + } + const std::string str = + "data: " + + 
diff --git a/src/llama_engine.h b/src/llama_engine.h
index fe17934..8ff830d 100644
--- a/src/llama_engine.h
+++ b/src/llama_engine.h
@@ -6,10 +6,13 @@
 #include "cortex-common/enginei.h"
 #include "file_logger.h"
 #include "llama.h"
+#include "llama_data.h"
 #include "llama_server_context.h"
 #include "trantor/utils/ConcurrentTaskQueue.h"
 #include "trantor/utils/Logger.h"
 
+using http_callback = std::function<void(Json::Value&&, Json::Value&&)>;
+
 class LlamaEngine : public EngineI {
  public:
   constexpr static auto kEngineName = "cortex.llamacpp";
@@ -18,45 +21,54 @@ class LlamaEngine : public EngineI {
   ~LlamaEngine() final;
 
-  // #### Interface ####
+  // Load the engine with the specified options.
   void Load(EngineLoadOption opts) final;
 
+  // Unload the engine with the specified options.
   void Unload(EngineUnloadOption opts) final;
 
-  void HandleChatCompletion(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void HandleEmbedding(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void LoadModel(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void UnloadModel(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void GetModelStatus(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
-  void GetModels(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback) final;
+  // Handle a chat completion request with the provided JSON body and callback.
+  void HandleChatCompletion(std::shared_ptr<Json::Value> json_body,
+                            http_callback&& callback) final;
+
+  // Handle an embedding request with the provided JSON body and callback.
+  void HandleEmbedding(std::shared_ptr<Json::Value> json_body,
+                       http_callback&& callback) final;
+
+  // Load a model with the provided JSON body and callback.
+  void LoadModel(std::shared_ptr<Json::Value> json_body,
+                 http_callback&& callback) final;
+
+  // Unload a model with the provided JSON body and callback.
+  void UnloadModel(std::shared_ptr<Json::Value> json_body,
+                   http_callback&& callback) final;
+
+  // Get the status of a model with the provided JSON body and callback.
+  void GetModelStatus(std::shared_ptr<Json::Value> json_body,
+                      http_callback&& callback) final;
+
+  // Get the list of available models with the provided JSON body and callback.
+  void GetModels(std::shared_ptr<Json::Value> json_body,
+                 http_callback&& callback) final;
+
+  // Set the file logger with the maximum number of log lines and log file path.
   void SetFileLogger(int max_log_lines, const std::string& log_path) final;
+
+  // Set the log level for the engine.
   void SetLogLevel(trantor::Logger::LogLevel log_level =
                        trantor::Logger::LogLevel::kInfo) final;
+
+  // Stop the inferencing process for the specified model.
   void StopInferencing(const std::string& model_id) final;
 
  private:
-  bool LoadModelImpl(std::shared_ptr<Json::Value> jsonBody);
+  bool LoadModelImpl(std::shared_ptr<Json::Value> json_body);
   void HandleInferenceImpl(
       llama::inferences::ChatCompletionRequest&& completion,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback);
-  void HandleEmbeddingImpl(
-      std::shared_ptr<Json::Value> jsonBody,
-      std::function<void(Json::Value&&, Json::Value&&)>&& callback);
-  bool CheckModelLoaded(
-      std::function<void(Json::Value&&, Json::Value&&)>& callback,
-      const std::string& model_id);
+      http_callback&& callback);
+  void HandleEmbeddingImpl(std::shared_ptr<Json::Value> json_body,
+                           http_callback&& callback);
+  bool CheckModelLoaded(http_callback& callback, const std::string& model_id);
   void WarmUpModel(const std::string& model_id);
 
   bool ShouldInitBackend() const;
@@ -64,6 +76,31 @@
   void RemoveForceStopInferenceModel(const std::string& id);
   bool HasForceStopInferenceModel(const std::string& id) const;
 
+  bool SpawnLlamaServer(const Json::Value& json_params);
+  std::string ConvertJsonToParams(const Json::Value& root);
+  std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root);
+
+  bool HandleLlamaCppChatCompletion(std::shared_ptr<Json::Value> json_body,
+                                    http_callback&& callback,
+                                    const std::string& model);
+
+  // Handle an OpenAI-compatible chat completion request with the provided
+  // JSON body, callback, and model.
+  void HandleOpenAiChatCompletion(std::shared_ptr<Json::Value> json_body,
+                                  http_callback&& callback,
+                                  const std::string& model);
+
+  // Handle a chat completion request that is not OpenAI-compatible, building
+  // the prompt from the model's prompt template.
+  void HandleNonOpenAiChatCompletion(std::shared_ptr<Json::Value> json_body,
+                                     http_callback&& callback,
+                                     const std::string& model);
+
+  // Handle an embedding request via the spawned llama.cpp server with the
+  // provided JSON body, callback, and model.
+  bool HandleLlamaCppEmbedding(std::shared_ptr<Json::Value> json_body,
+                               http_callback&& callback,
+                               const std::string& model);
+
+  bool IsLlamaServerModel(const std::string& model) const;
+
  private:
   struct ServerInfo {
     LlamaServerContext ctx;
@@ -81,8 +118,29 @@
     Json::Value stop_words;
   };
 
+  struct ServerConfig {
+    std::unique_ptr<trantor::ConcurrentTaskQueue> q;
+    std::string user_prompt;
+    std::string ai_prompt;
+    std::string system_prompt;
+    std::string pre_prompt;
+    std::string host;
+    int port;
+#if defined(_WIN32) || defined(_WIN64)
+    PROCESS_INFORMATION pi;
+#else
+    pid_t pid;
+#endif
+    uint64_t start_time;
+    uint32_t vram;
+    uint32_t ram;
+    uint64_t model_size;
+  };
+
   // key: model_id, value: ServerInfo
   std::unordered_map<std::string, ServerInfo> server_map_;
+  // TODO(sang) use variant map
+  std::unordered_map<std::string, ServerConfig> llama_server_map_;
   // lock the force_stop_inference_models_
   mutable std::mutex fsi_mtx_;
   std::unordered_set<std::string> force_stop_inference_models_;
@@ -91,7 +149,9 @@
   std::atomic<int> no_of_chats_ = 0;
   bool print_version_ = true;
+  std::unique_ptr<trantor::FileLogger> async_file_logger_;
+  EngineLoadOption load_opt_;
 #if defined(_WIN32)
   std::vector<DLL_DIRECTORY_COOKIE> cookies_;
diff --git a/src/llama_utils.h b/src/llama_utils.h
index 75acb8b..2465ec0 100644
--- a/src/llama_utils.h
+++ b/src/llama_utils.h
@@ -3,12 +3,17 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#if defined(__APPLE__)
+#include <mach-o/dyld.h>
+#endif
+
 namespace llama_utils {
 
 inline std::string models_folder = "./models";
@@ -176,4 +181,47 @@ inline std::string GetModelId(const Json::Value& jsonBody) {
   return {};
 }
+inline int GenerateRandomInteger(int min, int max) {
+  static std::random_device rd;   // Seed for the random number engine
+  static std::mt19937 gen(rd());  // Mersenne Twister random number engine
+  std::uniform_int_distribution<> dis(
+      min, max);  // Distribution for the desired range
+
+  return dis(gen);  // Generate and return a random integer within the range
+}
+
+inline std::filesystem::path GetExecutableFolderContainerPath() {
+#if defined(__APPLE__) && defined(__MACH__)
+  char buffer[1024];
+  uint32_t size = sizeof(buffer);
+
+  if (_NSGetExecutablePath(buffer, &size) == 0) {
+    // CTL_DBG("Executable path: " << buffer);
+    return std::filesystem::path{buffer}.parent_path();
+  } else {
+    LOG_ERROR << "Failed to get executable path";
+    return std::filesystem::current_path();
+  }
+#elif defined(__linux__)
+  char buffer[1024];
+  ssize_t len = readlink("/proc/self/exe", buffer, sizeof(buffer) - 1);
+  if (len != -1) {
+    buffer[len] = '\0';
+    // CTL_DBG("Executable path: " << buffer);
+    return std::filesystem::path{buffer}.parent_path();
+  } else {
+    LOG_ERROR << "Failed to get executable path";
+    return std::filesystem::current_path();
+  }
+#elif defined(_WIN32)
+  wchar_t buffer[MAX_PATH];
+  GetModuleFileNameW(NULL, buffer, MAX_PATH);
+  // CTL_DBG("Executable path: " << buffer);
+  return std::filesystem::path{buffer}.parent_path();
+#else
+  LOG_ERROR << "Unsupported platform!";
+  return std::filesystem::current_path();
+#endif
+}
+
 } // namespace llama_utils
\ No newline at end of file
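A hedged usage sketch for the two helpers added to llama_utils.h, for example picking a port for a spawned llama-server instance and resolving a sibling binary next to the running executable. The port range and the "llama-server" file name are illustrative assumptions, and std::filesystem::current_path() stands in for GetExecutableFolderContainerPath() so the example stays platform-neutral:

// Sketch only: GenerateRandomInteger is re-declared verbatim from the patch
// so the example is self-contained; nothing else here is taken from it.
#include <filesystem>
#include <iostream>
#include <random>

namespace llama_utils {
inline int GenerateRandomInteger(int min, int max) {
  static std::random_device rd;
  static std::mt19937 gen(rd());
  std::uniform_int_distribution<> dis(min, max);
  return dis(gen);
}
}  // namespace llama_utils

int main() {
  // Ephemeral-ish port for the child process (range is illustrative).
  int port = llama_utils::GenerateRandomInteger(10000, 60000);

  // Stand-in for GetExecutableFolderContainerPath(); the real helper uses the
  // platform-specific code shown in the patch above.
  std::filesystem::path exe_dir = std::filesystem::current_path();
  std::filesystem::path server_bin = exe_dir / "llama-server";

  std::cout << "would spawn " << server_bin << " --port " << port << '\n';
}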