Main update 12 Aug #215

Merged
55 commits merged on Aug 12, 2024

Commits
1eba86c
add tick
Guocork Jul 30, 2024
4fb515c
add logical
Guocork Jul 31, 2024
5ff0de3
add judge
Guocork Jul 31, 2024
c39d0fc
fix bug
Guocork Jul 31, 2024
95100f1
add arrow
Guocork Aug 1, 2024
d8de0a0
fix store
Guocork Aug 2, 2024
bc17f14
Stop chat streaming when switching conversation
jmbejar Aug 2, 2024
c4b6613
add animation
Guocork Aug 3, 2024
aad8dec
adjust layout
Guocork Aug 5, 2024
b6fc01b
Reimplement Modal using Makepad overlay features
jmbejar Aug 5, 2024
042d773
Reimplement model files info modals
jmbejar Aug 5, 2024
cfc209d
Clean warnings and dead code
jmbejar Aug 5, 2024
bc91139
Reimplement Delete model file confirmation dialog
jmbejar Aug 5, 2024
a277c3c
Reimplement Delete Chat confirmation dialog
jmbejar Aug 5, 2024
270c626
Add missing end-of-file char
jmbejar Aug 5, 2024
8c4f345
moxin-runner: on Windows, support WasmEdge no-avx WASI-nn plugin (rel…
kevinaboos Aug 5, 2024
96d4ded
disable overly verbose output from powershell
kevinaboos Aug 5, 2024
eee40fb
Ensure .zip extension is given to temp-downloaded zipfiles
kevinaboos Aug 6, 2024
57426b3
Merge branch 'main' into windows_fallback_to_wasi_nn_b3499_release
kevinaboos Aug 6, 2024
b387164
README: add info for selecting the right WASI-nn plugin version on wi…
kevinaboos Aug 6, 2024
7d150b9
further improve windows readme
kevinaboos Aug 6, 2024
fa6c18b
Specify that CUDA v12 is required on windows
kevinaboos Aug 6, 2024
11f7dc7
Move powershell script into Rust code so we can parameterize it.
kevinaboos Aug 5, 2024
7ad0c9d
fix the issue
Guocork Aug 6, 2024
7cc9803
Merge pull request #193 from kevinaboos/windows_fallback_to_wasi_nn_b…
jmbejar Aug 6, 2024
ca37d07
Merge pull request #192 from moxin-org/modals-revision
jmbejar Aug 6, 2024
4e22cbc
[Backend] Support frontend change ctx_size & batch_size
L-jasmine Aug 6, 2024
9efb754
[Backend] let the system decide api-server.wasm port
L-jasmine Aug 6, 2024
fd6fc34
Fix chat without stream error
L-jasmine Aug 6, 2024
f906fac
Merge pull request #191 from Guocork/main
jmbejar Aug 6, 2024
b740ee7
Merge branch 'main' into dev
jmbejar Aug 6, 2024
367c715
Fix problems with button hovers
jmbejar Jul 31, 2024
31f3562
Do not show chat history options button on title editing
jmbejar Jul 31, 2024
5ab1b12
Use reset_hover_on_click in certain buttons
jmbejar Aug 2, 2024
bb8c8df
Fix most of cursor issues when moving mouse pointer out of buttons
jmbejar Aug 2, 2024
c298b82
Merge pull request #187 from moxin-org/fix-button-hovers
jmbejar Aug 6, 2024
5999b54
Merge pull request #189 from moxin-org/stop-streaming-when-switching-…
jmbejar Aug 6, 2024
76aa78a
Fix visibility issue of loading animation in the Model Selector
jmbejar Aug 6, 2024
ecb4cb0
[Backend] change LoadModelOptions.n_ctx from u32 to Option<u32>
L-jasmine Aug 6, 2024
60715bd
Merge pull request #196 from moxin-org/fix-loading-animation-model-sw…
jmbejar Aug 6, 2024
8bb9d87
Merge pull request #194 from L-jasmine/feat/ctx-size
jmbejar Aug 6, 2024
3fefbe7
readme: mention that Windows is now supported
kevinaboos Aug 6, 2024
3b5483a
Merge pull request #197 from moxin-org/readme-windows-support
jmbejar Aug 6, 2024
d3d9b69
moxin-runner: support installing no-AVX versions of WasmEdge on Linux
kevinaboos Aug 6, 2024
234ef09
complete testing of new no-AVX support on Linux, Windows, and macOS
kevinaboos Aug 6, 2024
fc97ac7
Update to robius-open v0.1.1, which properly opens file URIs on Windows
kevinaboos Aug 7, 2024
3b85993
Merge pull request #201 from kevinaboos/fix_file_uri
jmbejar Aug 7, 2024
7759593
Merge pull request #198 from kevinaboos/linux_noavx_support
jmbejar Aug 7, 2024
bb5ad86
Use or install WasmEdge to the app-specific data directory
kevinaboos Aug 7, 2024
3fec28f
tested working on windows
kevinaboos Aug 7, 2024
e11e657
Merge pull request #205 from kevinaboos/use_wasmedge_installation_in_…
jmbejar Aug 7, 2024
1d5a1a9
fix issue 186
Guocork Aug 9, 2024
5c2e446
Merge pull request #212 from Guocork/fixed-issue-186
jmbejar Aug 12, 2024
d83d181
Fix intermittent problem in ModelCard on title or description
jmbejar Aug 12, 2024
245d104
Merge pull request #213 from moxin-org/fix-model-card-title-and-descr…
jmbejar Aug 12, 2024
35 changes: 28 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -30,7 +30,7 @@ moxin-fake-backend = { path = "moxin-fake-backend" }

makepad-widgets = { git = "https://github.com/jmbejar/makepad", branch = "moxin-release-v1" }

robius-open = "0.1.0"
robius-open = "0.1.1"
robius-url-handler = { git = "https://github.com/project-robius/robius-url-handler" }

chrono = "0.4"
36 changes: 27 additions & 9 deletions README.md
@@ -9,6 +9,7 @@ The following table shows which host systems can currently be used to build Moxin
| ------- | --------------- | ------- | ----- | -------------------------------------------- |
| macOS | macOS | ✅ | ✅ | `.app`, [`.dmg`] |
| Linux | Linux | ✅ | ✅ | [`.deb` (Debian dpkg)], [AppImage], [pacman] |
| Windows | Windows (10+) | ✅ | ✅ | `.exe` (NSIS) |

## Building and Running

@@ -41,6 +42,9 @@ curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/insta
source $HOME/.wasmedge/env
```

> [!IMPORTANT]
> If your CPU does not support AVX512, then you should append the `--noavx` option onto the above command.

To build Moxin on Linux, you must install the following dependencies:
`openssl`, `clang`/`libclang`, `binfmt`, `Xcursor`/`X11`, `asound`/`pulse`.

@@ -64,6 +68,15 @@ cargo run --release

2. Restart your PC, or log out and log back in, which allows the LLVM path to be properly recognized.
* Alternatively, you can add the LLVM path `C:\Program Files\LLVM\bin` to your system PATH.


> [!TIP]
> To automatically handle Steps 3 and 4, simply run:
> ```sh
> cargo run -p moxin-runner -- --install
> ```


3. Download the [WasmEdge-0.14.0-windows.zip](https://github.com/WasmEdge/WasmEdge/releases/download/0.14.0/WasmEdge-0.14.0-windows.zip) file from [the WasmEdge v0.14.0 release page](https://github.com/WasmEdge/WasmEdge/releases/tag/0.14.0),
and then extract it into a directory of your choice.
We recommend using your home directory (e.g., `C:\Users\<USERNAME>\`), represented by `$home` in powershell and `%homedrive%%homepath%` in batch-cmd.
@@ -78,18 +91,23 @@ cargo run --release
$ProgressPreference = 'Continue' ## restore default progress bars
```

4. Download the WasmEdge WASI-NN plugin here: [WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip](https://github.com/WasmEdge/WasmEdge/releases/download/0.14.0/WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip) (15.5MB) and extract it to the same directory as above, e.g., `C:\Users\<USERNAME>\WasmEdge-0.14.0-Windows`.
4. Download [the appropriate WasmEdge WASI-NN plugin](https://github.com/second-state/WASI-NN-GGML-PLUGIN-REGISTRY/releases/tag/b3499) (see below for details), extract/unzip it, and copy the `lib\wasmedge` directory from the .zip archive into the `lib\` directory of the above WasmEdge installation directory, e.g., `C:\Users\<USERNAME>\WasmEdge-0.14.0-Windows\lib`.

> [!IMPORTANT]
> You will be asked whether you want to replace the files that already exist; select `Replace the files in the destination` when doing so.
* To do this quickly in powershell:
```powershell
$ProgressPreference = 'SilentlyContinue' ## makes downloads much faster
Invoke-WebRequest -Uri "https://github.com/WasmEdge/WasmEdge/releases/download/0.14.0/WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip" -OutFile "WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip"
Expand-Archive -Force -LiteralPath "WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip" -DestinationPath "$home\WasmEdge-0.14.0-Windows"
$ProgressPreference = 'Continue' ## restore default progress bars
```
> The only file that matters is the plugin file, which must exist at the path `WasmEdge-0.14.0-Windows\lib\wasmedge\wasmedgePluginWasiNN.dll`

* If your computer has a CUDA v12-capable GPU, select [WasmEdge-plugin-wasi_nn-ggml-cuda-0.14.0-windows_x86_64.zip](https://github.com/second-state/WASI-NN-GGML-PLUGIN-REGISTRY/releases/download/b3499/WasmEdge-plugin-wasi_nn-ggml-cuda-0.14.0-windows_x86_64.zip).
* Note that **CUDA version 12** is required.
* If your computer doesn't have CUDA 12, then select either:
* [WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip](https://github.com/second-state/WASI-NN-GGML-PLUGIN-REGISTRY/releases/download/b3499/WasmEdge-plugin-wasi_nn-ggml-0.14.0-windows_x86_64.zip) if your CPU supports AVX-512, or
* [WasmEdge-plugin-wasi_nn-ggml-noavx-0.14.0-windows_x86_64.zip](https://github.com/second-state/WASI-NN-GGML-PLUGIN-REGISTRY/releases/tag/b3499#:~:text=WasmEdge%2Dplugin%2Dwasi_nn%2Dggml%2Dnoavx%2D0.14.0%2Dwindows_x86_64.zip) if your CPU does *not* support AVX-512.


5. Set the `WASMEDGE_DIR` and `WASMEDGE_PLUGIN_PATH` environment variables to point to the `WasmEdge-0.14.0-Windows` directory that you extracted above, and then build Moxin.

> [!IMPORTANT]
> You may also need to add the `WasmEdge-0.14.0-Windows\bin` directory to your `PATH` environment variable (on some versions of Windows).

In powershell, you can do this like so:
```powershell
$env:WASMEDGE_DIR="$home\WasmEdge-0.14.0-Windows\"
35 changes: 27 additions & 8 deletions moxin-backend/src/backend_impls/api_server.rs
@@ -23,6 +23,7 @@ static WASM: &[u8] = include_bytes!("../../wasm/llama-api-server.wasm");
pub struct LLamaEdgeApiServer {
id: String,
listen_addr: SocketAddr,
load_model_options: LoadModelOptions,
wasm_module: Module,
running_controller: tokio::sync::broadcast::Sender<()>,
#[allow(dead_code)]
@@ -35,15 +36,23 @@ fn create_wasi(
load_model: &LoadModelOptions,
) -> wasmedge_sdk::WasmEdgeResult<WasiModule> {
// use model metadata context size
let ctx_size = Some(format!("{}", file.context_size.min(8 * 1024)));
let ctx_size = if let Some(n_ctx) = load_model.n_ctx {
Some(format!("{}", n_ctx))
} else {
Some(format!("{}", file.context_size.min(8 * 1024)))
};

let n_gpu_layers = match load_model.gpu_layers {
moxin_protocol::protocol::GPULayers::Specific(n) => Some(n.to_string()),
moxin_protocol::protocol::GPULayers::Max => None,
};

// Use the frontend-provided n_batch if given; otherwise fall back to a fixed value of 128.
let batch_size = Some(format!("128"));
let batch_size = if let Some(n_batch) = load_model.n_batch {
Some(format!("{}", n_batch))
} else {
Some("128".to_string())
};

let mut prompt_template = load_model.prompt_template.clone();
if prompt_template.is_none() && !file.prompt_template.is_empty() {
@@ -133,17 +142,23 @@ impl BackendModel for LLamaEdgeApiServer {
options: moxin_protocol::protocol::LoadModelOptions,
tx: std::sync::mpsc::Sender<anyhow::Result<moxin_protocol::protocol::LoadModelResponse>>,
) -> Self {
let load_model_options = options.clone();
let mut need_reload = true;
let (wasm_module, listen_addr) = if let Some(old_model) = &old_model {
if old_model.id == file.id.as_str() {
if old_model.id == file.id.as_str()
&& old_model.load_model_options.n_ctx == options.n_ctx
&& old_model.load_model_options.n_batch == options.n_batch
{
need_reload = false;
}
(old_model.wasm_module.clone(), old_model.listen_addr)
} else {
(
Module::from_bytes(None, WASM).unwrap(),
([0, 0, 0, 0], 8080).into(),
)
let new_addr = std::net::TcpListener::bind("localhost:0")
.unwrap()
.local_addr()
.unwrap();

(Module::from_bytes(None, WASM).unwrap(), new_addr)
};

if !need_reload {
@@ -152,6 +167,7 @@
file_id: file.id.to_string(),
model_id: file.model_id,
information: "".to_string(),
listen_port: listen_addr.port(),
},
)));
return old_model.unwrap();
@@ -165,7 +181,8 @@

let file_id = file.id.to_string();

let url = format!("http://localhost:{}/echo", listen_addr.port());
let listen_port = listen_addr.port();
let url = format!("http://localhost:{}/echo", listen_port);

let file_ = file.clone();

@@ -197,6 +214,7 @@
file_id: file_.id.to_string(),
model_id: file_.model_id,
information: "".to_string(),
listen_port,
},
)));
} else {
@@ -212,6 +230,7 @@
listen_addr,
running_controller,
model_thread,
load_model_options,
};

new_model
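
As an aside (not part of this diff), the port-selection trick above is just the standard library's ephemeral-port binding: asking the OS for port 0 and reading the port it actually assigned. A minimal standalone sketch:

```rust
use std::net::TcpListener;

fn main() -> std::io::Result<()> {
    // Binding to port 0 asks the OS for any free port; the chosen port is
    // then read back via local_addr(), as the backend code above does.
    let listener = TcpListener::bind("localhost:0")?;
    let port = listener.local_addr()?.port();
    println!("OS assigned port {port}");
    Ok(())
}
```
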
2 changes: 2 additions & 0 deletions moxin-backend/src/backend_impls/chat_ui.rs
@@ -228,6 +228,7 @@ fn get_input(
file_id,
model_id,
information: String::new(),
listen_port: 0,
})));
}

@@ -430,6 +431,7 @@ impl super::BackendModel for ChatBotModel {
file_id: file.id.to_string(),
model_id: file.model_id,
information: "".to_string(),
listen_port: 0,
})));
return old_model.unwrap();
}
4 changes: 4 additions & 0 deletions moxin-backend/src/backend_impls/mod.rs
@@ -139,6 +139,8 @@ fn test_chat() {
rope_freq_scale: 0.0,
rope_freq_base: 0.0,
context_overflow_policy: moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
n_batch: Some(128),
n_ctx: Some(1024),
},
tx,
);
@@ -209,6 +211,8 @@ fn test_chat_stop() {
prompt_template: None,
gpu_layers: moxin_protocol::protocol::GPULayers::Max,
use_mlock: false,
n_batch: Some(128),
n_ctx: Some(1024),
rope_freq_scale: 0.0,
rope_freq_base: 0.0,
context_overflow_policy: moxin_protocol::protocol::ContextOverflowPolicy::StopAtLimit,
1 change: 1 addition & 0 deletions moxin-protocol/src/open_ai.rs
@@ -106,6 +106,7 @@ pub struct ChatResponseData {
pub choices: Vec<ChoiceData>,
pub created: u32,
pub model: ModelID,
#[serde(default)]
pub system_fingerprint: String,
pub usage: UsageData,

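For readers unfamiliar with the attribute: `#[serde(default)]` makes deserialization tolerate a missing `system_fingerprint` key by falling back to `String::default()` (an empty string) instead of returning an error, which is presumably needed because some OpenAI-compatible servers omit the field. A hedged, standalone sketch (assumes `serde` with the `derive` feature and `serde_json`; the struct is a simplified stand-in, not the real `ChatResponseData`):

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct ChatResponseSketch {
    model: String,
    // Without `default`, a payload missing this key would fail to deserialize.
    #[serde(default)]
    system_fingerprint: String,
}

fn main() {
    // Note: no "system_fingerprint" key in this payload.
    let json = r#"{ "model": "llama-3-8b-instruct" }"#;
    let resp: ChatResponseSketch = serde_json::from_str(json).expect("deserializes fine");
    assert_eq!(resp.system_fingerprint, "");
    println!("{resp:?}");
}
```
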
7 changes: 6 additions & 1 deletion moxin-protocol/src/protocol.rs
@@ -28,9 +28,10 @@ pub struct LoadModelOptions {
pub prompt_template: Option<String>,
pub gpu_layers: GPULayers,
pub use_mlock: bool,
pub n_batch: Option<u32>,
pub n_ctx: Option<u32>,
pub rope_freq_scale: f32,
pub rope_freq_base: f32,

// TBD Not really sure if this is something the backend manages or if it is a matter of
// the client (if it is done by tweaking the JSON payload for the chat completion)
pub context_overflow_policy: ContextOverflowPolicy,
@@ -41,6 +42,10 @@ pub struct LoadedModelInfo {
pub file_id: FileID,
pub model_id: ModelID,

// The port where the local server is listening for the model.
// If 0, the server is not running.
pub listen_port: u16,

// JSON formatted string with the model information. See "Model Inspector" in LMStudio.
pub information: String,
}
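
A hedged sketch of how the new `Option<u32>` fields interact with the backend defaults shown in `api_server.rs` above (the helper names are illustrative, not from the codebase): `None` keeps the previous behavior, while `Some(n)` lets the frontend override it.

```rust
// Illustrative helpers mirroring the fallback logic in create_wasi() above.
fn effective_ctx_size(n_ctx: Option<u32>, model_context_size: u32) -> u32 {
    // Frontend value wins; otherwise use the model metadata context size,
    // capped at 8 * 1024 as before.
    n_ctx.unwrap_or_else(|| model_context_size.min(8 * 1024))
}

fn effective_batch_size(n_batch: Option<u32>) -> u32 {
    // Frontend value wins; otherwise keep the previous fixed default of 128.
    n_batch.unwrap_or(128)
}

fn main() {
    assert_eq!(effective_ctx_size(None, 32 * 1024), 8 * 1024);
    assert_eq!(effective_ctx_size(Some(2048), 32 * 1024), 2048);
    assert_eq!(effective_batch_size(Some(256)), 256);
    assert_eq!(effective_batch_size(None), 128);
}
```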