From 4e22cbc51da7aa9a0a251881f8d127eca23edf4d Mon Sep 17 00:00:00 2001
From: csh <458761603@qq.com>
Date: Tue, 6 Aug 2024 21:47:30 +0800
Subject: [PATCH 1/4] [Backend] Support frontend change ctx_size & batch_size

Signed-off-by: csh <458761603@qq.com>
---
 moxin-backend/src/backend_impls/api_server.rs | 12 ++++++++++--
 moxin-backend/src/backend_impls/mod.rs        |  4 ++++
 moxin-protocol/src/protocol.rs                |  5 ++++-
 src/data/chats/model_loader.rs                |  2 ++
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/moxin-backend/src/backend_impls/api_server.rs b/moxin-backend/src/backend_impls/api_server.rs
index 1352c563..55d10ba8 100644
--- a/moxin-backend/src/backend_impls/api_server.rs
+++ b/moxin-backend/src/backend_impls/api_server.rs
@@ -35,7 +35,11 @@ fn create_wasi(
     load_model: &LoadModelOptions,
 ) -> wasmedge_sdk::WasmEdgeResult {
     // use model metadata context size
-    let ctx_size = Some(format!("{}", file.context_size.min(8 * 1024)));
+    let ctx_size = if load_model.n_ctx > 0 {
+        Some(format!("{}", load_model.n_ctx))
+    } else {
+        Some(format!("{}", file.context_size.min(8 * 1024)))
+    };
 
     let n_gpu_layers = match load_model.gpu_layers {
         moxin_protocol::protocol::GPULayers::Specific(n) => Some(n.to_string()),
@@ -43,7 +47,11 @@ fn create_wasi(
     };
 
     // Set n_batch to a fixed value of 128.
-    let batch_size = Some(format!("128"));
+    let batch_size = if load_model.n_batch > 0 {
+        Some(format!("{}", load_model.n_batch))
+    } else {
+        Some("128".to_string())
+    };
 
     let mut prompt_template = load_model.prompt_template.clone();
     if prompt_template.is_none() && !file.prompt_template.is_empty() {
diff --git a/moxin-backend/src/backend_impls/mod.rs b/moxin-backend/src/backend_impls/mod.rs
index d91f0653..152f2bfc 100644
--- a/moxin-backend/src/backend_impls/mod.rs
+++ b/moxin-backend/src/backend_impls/mod.rs
@@ -139,6 +139,8 @@ fn test_chat() {
             rope_freq_scale: 0.0,
             rope_freq_base: 0.0,
             context_overflow_policy: moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
+            n_batch: 128,
+            n_ctx: 1024,
         },
         tx,
     );
@@ -209,6 +211,8 @@ fn test_chat_stop() {
             prompt_template: None,
             gpu_layers: moxin_protocol::protocol::GPULayers::Max,
             use_mlock: false,
+            n_batch: 128,
+            n_ctx: 1024,
             rope_freq_scale: 0.0,
             rope_freq_base: 0.0,
             context_overflow_policy: moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
diff --git a/moxin-protocol/src/protocol.rs b/moxin-protocol/src/protocol.rs
index 694ef96c..c9d71164 100644
--- a/moxin-protocol/src/protocol.rs
+++ b/moxin-protocol/src/protocol.rs
@@ -28,9 +28,12 @@ pub struct LoadModelOptions {
     pub prompt_template: Option,
     pub gpu_layers: GPULayers,
     pub use_mlock: bool,
+    // if 0, the backend will use the default value
+    pub n_batch: u32,
+    // if 0, the backend will use the default value
+    pub n_ctx: u32,
     pub rope_freq_scale: f32,
     pub rope_freq_base: f32,
-
     // TBD Not really sure if this is something backend manages or if it is matter of
     // the client (if it is done by tweaking the JSON payload for the chat completition)
     pub context_overflow_policy: ContextOverflowPolicy,
diff --git a/src/data/chats/model_loader.rs b/src/data/chats/model_loader.rs
index 98e723d4..e16279af 100644
--- a/src/data/chats/model_loader.rs
+++ b/src/data/chats/model_loader.rs
@@ -34,6 +34,8 @@ impl ModelLoader {
                 rope_freq_base: 0.0,
                 context_overflow_policy:
                     moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
+                n_batch: 0,
+                n_ctx: 0,
             },
             tx,
        );
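
In patch 1, a zero n_ctx or n_batch is the sentinel for "not set by the frontend", in which case the backend keeps its old defaults (the model's metadata context size capped at 8 * 1024, and a fixed batch size of 128). The standalone Rust sketch below illustrates that fallback; Overrides and resolve_sizes are made-up names for illustration, not code from the patch.

// Hypothetical stand-ins mirroring the fields patch 1 adds to LoadModelOptions.
struct Overrides {
    n_ctx: u32,   // 0 => use the model's metadata context size, capped at 8 * 1024
    n_batch: u32, // 0 => keep the previous fixed default of 128
}

// Illustrative helper: computes the values create_wasi would pass on.
fn resolve_sizes(opts: &Overrides, model_context_size: u32) -> (String, String) {
    let ctx_size = if opts.n_ctx > 0 {
        opts.n_ctx.to_string()
    } else {
        model_context_size.min(8 * 1024).to_string()
    };
    let batch_size = if opts.n_batch > 0 {
        opts.n_batch.to_string()
    } else {
        "128".to_string()
    };
    (ctx_size, batch_size)
}

fn main() {
    // Zero means "use the defaults".
    assert_eq!(
        resolve_sizes(&Overrides { n_ctx: 0, n_batch: 0 }, 32 * 1024),
        ("8192".to_string(), "128".to_string())
    );
    // Frontend-supplied values win.
    assert_eq!(
        resolve_sizes(&Overrides { n_ctx: 1024, n_batch: 256 }, 32 * 1024),
        ("1024".to_string(), "256".to_string())
    );
}
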
From 9efb7547ef92e3ba210ea1e5bc732e8e54b1c2a7 Mon Sep 17 00:00:00 2001
From: csh <458761603@qq.com>
Date: Tue, 6 Aug 2024 21:47:30 +0800
Subject: [PATCH 2/4] [Backend] let the system decide api-server.wasm port

Signed-off-by: csh <458761603@qq.com>
---
 moxin-backend/src/backend_impls/api_server.rs | 23 +++++++++++++++++------
 moxin-backend/src/backend_impls/chat_ui.rs    |  2 ++
 moxin-protocol/src/protocol.rs                |  4 ++++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/moxin-backend/src/backend_impls/api_server.rs b/moxin-backend/src/backend_impls/api_server.rs
index 55d10ba8..380c278f 100644
--- a/moxin-backend/src/backend_impls/api_server.rs
+++ b/moxin-backend/src/backend_impls/api_server.rs
@@ -23,6 +23,7 @@ static WASM: &[u8] = include_bytes!("../../wasm/llama-api-server.wasm");
 pub struct LLamaEdgeApiServer {
     id: String,
     listen_addr: SocketAddr,
+    load_model_options: LoadModelOptions,
     wasm_module: Module,
     running_controller: tokio::sync::broadcast::Sender<()>,
     #[allow(dead_code)]
@@ -141,17 +142,23 @@ impl BackendModel for LLamaEdgeApiServer {
         options: moxin_protocol::protocol::LoadModelOptions,
         tx: std::sync::mpsc::Sender>,
     ) -> Self {
+        let load_model_options = options.clone();
         let mut need_reload = true;
         let (wasm_module, listen_addr) = if let Some(old_model) = &old_model {
-            if old_model.id == file.id.as_str() {
+            if old_model.id == file.id.as_str()
+                && old_model.load_model_options.n_ctx == options.n_ctx
+                && old_model.load_model_options.n_batch == options.n_batch
+            {
                 need_reload = false;
             }
             (old_model.wasm_module.clone(), old_model.listen_addr)
         } else {
-            (
-                Module::from_bytes(None, WASM).unwrap(),
-                ([0, 0, 0, 0], 8080).into(),
-            )
+            let new_addr = std::net::TcpListener::bind("localhost:0")
+                .unwrap()
+                .local_addr()
+                .unwrap();
+
+            (Module::from_bytes(None, WASM).unwrap(), new_addr)
         };
 
         if !need_reload {
@@ -160,6 +167,7 @@ impl BackendModel for LLamaEdgeApiServer {
                     file_id: file.id.to_string(),
                     model_id: file.model_id,
                     information: "".to_string(),
+                    listen_port: listen_addr.port(),
                 },
             )));
             return old_model.unwrap();
@@ -173,7 +181,8 @@ impl BackendModel for LLamaEdgeApiServer {
 
         let file_id = file.id.to_string();
 
-        let url = format!("http://localhost:{}/echo", listen_addr.port());
+        let listen_port = listen_addr.port();
+        let url = format!("http://localhost:{}/echo", listen_port);
 
         let file_ = file.clone();
 
@@ -205,6 +214,7 @@ impl BackendModel for LLamaEdgeApiServer {
                         file_id: file_.id.to_string(),
                         model_id: file_.model_id,
                         information: "".to_string(),
+                        listen_port,
                     },
                 )));
             } else {
@@ -220,6 +230,7 @@ impl BackendModel for LLamaEdgeApiServer {
             listen_addr,
             running_controller,
             model_thread,
+            load_model_options,
         };
 
         new_model
diff --git a/moxin-backend/src/backend_impls/chat_ui.rs b/moxin-backend/src/backend_impls/chat_ui.rs
index 411ca05d..337e7d2c 100644
--- a/moxin-backend/src/backend_impls/chat_ui.rs
+++ b/moxin-backend/src/backend_impls/chat_ui.rs
@@ -228,6 +228,7 @@ fn get_input(
             file_id,
             model_id,
             information: String::new(),
+            listen_port: 0,
         })));
     }
 
@@ -430,6 +431,7 @@ impl super::BackendModel for ChatBotModel {
                 file_id: file.id.to_string(),
                 model_id: file.model_id,
                 information: "".to_string(),
+                listen_port: 0,
             })));
             return old_model.unwrap();
         }
diff --git a/moxin-protocol/src/protocol.rs b/moxin-protocol/src/protocol.rs
index c9d71164..cfded354 100644
--- a/moxin-protocol/src/protocol.rs
+++ b/moxin-protocol/src/protocol.rs
@@ -44,6 +44,10 @@ pub struct LoadedModelInfo {
     pub file_id: FileID,
     pub model_id: ModelID,
 
+    // The port where the local server is listening for the model.
+    // if 0, the server is not running.
+    pub listen_port: u16,
+
     // JSON formatted string with the model information. See "Model Inspector" in LMStudio.
     pub information: String,
 }
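
The core of patch 2 is dropping the hard-coded port 8080 and letting the OS assign a free one: bind to port 0, read the real port back from local_addr(), and report it through the new LoadedModelInfo::listen_port field. A minimal, standard-library-only illustration of that trick (not the LLamaEdgeApiServer code itself):

use std::net::TcpListener;

fn main() -> std::io::Result<()> {
    // Port 0 means "any free port"; the OS picks one, and local_addr() tells us which.
    let listener = TcpListener::bind("localhost:0")?;
    let port = listener.local_addr()?.port();
    println!("the api-server would be reachable at http://localhost:{port}");
    Ok(())
}

The probe listener only reserves the port while it is alive; as in the patch, the process that actually serves on that port has to bind it again afterwards.
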
See "Model Inspector" in LMStudio. pub information: String, } From fd6fc34fac5ffec2a38e9d323b8ed5e583a4ca59 Mon Sep 17 00:00:00 2001 From: csh <458761603@qq.com> Date: Tue, 6 Aug 2024 21:47:30 +0800 Subject: [PATCH 3/4] Fix chat without stream error Signed-off-by: csh <458761603@qq.com> --- moxin-protocol/src/open_ai.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/moxin-protocol/src/open_ai.rs b/moxin-protocol/src/open_ai.rs index ea277a2f..d00c60d3 100644 --- a/moxin-protocol/src/open_ai.rs +++ b/moxin-protocol/src/open_ai.rs @@ -106,6 +106,7 @@ pub struct ChatResponseData { pub choices: Vec, pub created: u32, pub model: ModelID, + #[serde(default)] pub system_fingerprint: String, pub usage: UsageData, From ecb4cb00b4a384e9952c00dfeec6372bbb6ab9d2 Mon Sep 17 00:00:00 2001 From: csh <458761603@qq.com> Date: Tue, 6 Aug 2024 22:58:59 +0800 Subject: [PATCH 4/4] [Backend] change LoadModelOptions.n_ctx from u32 to Option Signed-off-by: csh <458761603@qq.com> --- moxin-backend/src/backend_impls/api_server.rs | 8 ++++---- moxin-backend/src/backend_impls/mod.rs | 8 ++++---- moxin-protocol/src/protocol.rs | 6 ++---- src/data/chats/model_loader.rs | 4 ++-- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/moxin-backend/src/backend_impls/api_server.rs b/moxin-backend/src/backend_impls/api_server.rs index 380c278f..1cf5a324 100644 --- a/moxin-backend/src/backend_impls/api_server.rs +++ b/moxin-backend/src/backend_impls/api_server.rs @@ -36,8 +36,8 @@ fn create_wasi( load_model: &LoadModelOptions, ) -> wasmedge_sdk::WasmEdgeResult { // use model metadata context size - let ctx_size = if load_model.n_ctx > 0 { - Some(format!("{}", load_model.n_ctx)) + let ctx_size = if let Some(n_ctx) = load_model.n_ctx { + Some(format!("{}", n_ctx)) } else { Some(format!("{}", file.context_size.min(8 * 1024))) }; @@ -48,8 +48,8 @@ fn create_wasi( }; // Set n_batch to a fixed value of 128. 
From ecb4cb00b4a384e9952c00dfeec6372bbb6ab9d2 Mon Sep 17 00:00:00 2001
From: csh <458761603@qq.com>
Date: Tue, 6 Aug 2024 22:58:59 +0800
Subject: [PATCH 4/4] [Backend] change LoadModelOptions.n_ctx from u32 to Option<u32>

Signed-off-by: csh <458761603@qq.com>
---
 moxin-backend/src/backend_impls/api_server.rs | 8 ++++----
 moxin-backend/src/backend_impls/mod.rs        | 8 ++++----
 moxin-protocol/src/protocol.rs                | 6 ++----
 src/data/chats/model_loader.rs                | 4 ++--
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/moxin-backend/src/backend_impls/api_server.rs b/moxin-backend/src/backend_impls/api_server.rs
index 380c278f..1cf5a324 100644
--- a/moxin-backend/src/backend_impls/api_server.rs
+++ b/moxin-backend/src/backend_impls/api_server.rs
@@ -36,8 +36,8 @@ fn create_wasi(
     load_model: &LoadModelOptions,
 ) -> wasmedge_sdk::WasmEdgeResult {
     // use model metadata context size
-    let ctx_size = if load_model.n_ctx > 0 {
-        Some(format!("{}", load_model.n_ctx))
+    let ctx_size = if let Some(n_ctx) = load_model.n_ctx {
+        Some(format!("{}", n_ctx))
     } else {
         Some(format!("{}", file.context_size.min(8 * 1024)))
     };
@@ -48,8 +48,8 @@ fn create_wasi(
     };
 
     // Set n_batch to a fixed value of 128.
-    let batch_size = if load_model.n_batch > 0 {
-        Some(format!("{}", load_model.n_batch))
+    let batch_size = if let Some(n_batch) = load_model.n_batch {
+        Some(format!("{}", n_batch))
     } else {
         Some("128".to_string())
     };
diff --git a/moxin-backend/src/backend_impls/mod.rs b/moxin-backend/src/backend_impls/mod.rs
index 152f2bfc..9b3ca55f 100644
--- a/moxin-backend/src/backend_impls/mod.rs
+++ b/moxin-backend/src/backend_impls/mod.rs
@@ -139,8 +139,8 @@ fn test_chat() {
             rope_freq_scale: 0.0,
             rope_freq_base: 0.0,
             context_overflow_policy: moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
-            n_batch: 128,
-            n_ctx: 1024,
+            n_batch: Some(128),
+            n_ctx: Some(1024),
         },
         tx,
     );
@@ -211,8 +211,8 @@ fn test_chat_stop() {
             prompt_template: None,
             gpu_layers: moxin_protocol::protocol::GPULayers::Max,
             use_mlock: false,
-            n_batch: 128,
-            n_ctx: 1024,
+            n_batch: Some(128),
+            n_ctx: Some(1024),
             rope_freq_scale: 0.0,
             rope_freq_base: 0.0,
             context_overflow_policy: moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
diff --git a/moxin-protocol/src/protocol.rs b/moxin-protocol/src/protocol.rs
index cfded354..4c576773 100644
--- a/moxin-protocol/src/protocol.rs
+++ b/moxin-protocol/src/protocol.rs
@@ -28,10 +28,8 @@ pub struct LoadModelOptions {
     pub prompt_template: Option,
     pub gpu_layers: GPULayers,
     pub use_mlock: bool,
-    // if 0, the backend will use the default value
-    pub n_batch: u32,
-    // if 0, the backend will use the default value
-    pub n_ctx: u32,
+    pub n_batch: Option<u32>,
+    pub n_ctx: Option<u32>,
     pub rope_freq_scale: f32,
     pub rope_freq_base: f32,
     // TBD Not really sure if this is something backend manages or if it is matter of
diff --git a/src/data/chats/model_loader.rs b/src/data/chats/model_loader.rs
index e16279af..b43477dd 100644
--- a/src/data/chats/model_loader.rs
+++ b/src/data/chats/model_loader.rs
@@ -34,8 +34,8 @@ impl ModelLoader {
                 rope_freq_base: 0.0,
                 context_overflow_policy:
                     moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
-                n_batch: 0,
-                n_ctx: 0,
+                n_batch: None,
+                n_ctx: None,
             },
             tx,
         );
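
With patch 4, "unset" is expressed as None rather than the zero sentinel, so the intent is visible in the type. The helpers below are illustrative only (ctx_size_arg and batch_size_arg are not functions from the patch); they show the same fallback written with Option combinators, equivalent to the if let form create_wasi now uses:

// Hypothetical helpers mirroring the fallback logic in create_wasi after patch 4.
fn ctx_size_arg(n_ctx: Option<u32>, model_context_size: u32) -> String {
    // None => fall back to the model metadata value, capped at 8 * 1024.
    n_ctx
        .map(|n| n.to_string())
        .unwrap_or_else(|| model_context_size.min(8 * 1024).to_string())
}

fn batch_size_arg(n_batch: Option<u32>) -> String {
    // None => keep the fixed default of 128.
    n_batch.map(|n| n.to_string()).unwrap_or_else(|| "128".to_string())
}

fn main() {
    assert_eq!(ctx_size_arg(None, 32 * 1024), "8192");
    assert_eq!(ctx_size_arg(Some(1024), 32 * 1024), "1024");
    assert_eq!(batch_size_arg(None), "128");
    assert_eq!(batch_size_arg(Some(256)), "256");
}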