From d09543da372ff06486edc9e576c9fdd1912b0740 Mon Sep 17 00:00:00 2001
From: Zhang Tianyang
Date: Mon, 5 Aug 2024 21:37:52 +0800
Subject: [PATCH] wasm: feat: support WasmEdge wasi_nn plugin with llm application

Signed-off-by: Zhang Tianyang
---
 .github/workflows/ci.yml                      |   3 +
 .../How-to-run-Llama-3-8B-with-Kubernetes.md  | 140 ++++++++++++++++++
 wasm/src/wasmedge.rs                          |  85 +++++++++--
 3 files changed, 219 insertions(+), 9 deletions(-)
 create mode 100644 docs/wasm/How-to-run-Llama-3-8B-with-Kubernetes.md

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3f70ad97..30311cf8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,6 +18,9 @@ jobs:
       - directories: wasm
        features: --features=wasmedge
        wasmEdge: 0.13.5
+      - directories: wasm
+        features: --features="wasmedge, wasmedge_wasi_nn"
+        wasmEdge: 0.13.5
       - directories: wasm
        features: --features=wasmtime
    runs-on: ubuntu-latest
diff --git a/docs/wasm/How-to-run-Llama-3-8B-with-Kubernetes.md b/docs/wasm/How-to-run-Llama-3-8B-with-Kubernetes.md
new file mode 100644
index 00000000..381c4947
--- /dev/null
+++ b/docs/wasm/How-to-run-Llama-3-8B-with-Kubernetes.md
@@ -0,0 +1,140 @@
+# How to run a Llama-3-8B inference application in Kubernetes?
+
+## What is LlamaEdge?
+
+The [LlamaEdge](https://github.com/LlamaEdge/LlamaEdge) project makes it easy to run LLM inference apps and to
+create OpenAI-compatible API services for the Llama3 series of LLMs locally.
+
+With WasmEdge, you can create and deploy very fast and very lightweight LLM inference applications; see
+details at https://www.secondstate.io/articles/wasm-runtime-agi/.
+
+## How to run an LLM inference application in Kuasar?
+
+Since Kuasar v0.8.0, the Kuasar wasm-sandboxer built with the `wasmedge` and `wasmedge_wasi_nn`
+features allows your WasmEdge application to use the WASI-NN API for
+machine learning inference: https://github.com/WebAssembly/wasi-nn.
+
+This article is inspired by [Getting Started with Llama-3-8B](https://www.secondstate.io/articles/llama-3-8b/),
+which introduces how to create an OpenAI-compatible API service for Llama-3-8B.
+
+### Prerequisites
+
++ Install WasmEdge and the required plugins:
+`curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- -v 0.13.5 --plugins wasi_logging wasi_nn-ggml`
+
+### 1. Build the Docker image
+
+We already provide an example Docker image on Docker Hub: `docker.io/kuasario/llama-api-server:v1`.
+Follow the steps below if you want to build your own image containing the LLM application, the model and the other required files.
+
++ Download the Llama-3-8B model GGUF file. Since the model is 5.73 GB, it could take a while to download:
+`curl -LO https://huggingface.co/second-state/Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf`
+
++ Get your LlamaEdge app. Taking the api-server as an example, download it with
+`curl -LO https://github.com/LlamaEdge/LlamaEdge/releases/latest/download/llama-api-server.wasm`.
+It is a web server that provides an OpenAI-compatible API service, as well as an optional web UI, for Llama3 models.
+
++ Download the chatbot web UI so you can interact with the model through a browser:
+```bash
+curl -LO https://github.com/LlamaEdge/chatbot-ui/releases/latest/download/chatbot-ui.tar.gz
+tar xzf chatbot-ui.tar.gz
+rm chatbot-ui.tar.gz
+```
+
++ Build it! Here is an example Dockerfile:
+```dockerfile
+FROM scratch
+COPY . /
+CMD ["llama-api-server.wasm", "--prompt-template", "llama-3-chat", "--ctx-size", "4096", "--model-name", "Llama-3-8B", "--log-all"]
+```
+Build the image with `docker build -t docker.io/kuasario/llama-api-server:v1 .`
+
+### 2. Build and run the Kuasar wasm-sandboxer
+
+```bash
+git clone https://github.com/kuasar-io/kuasar.git
+cd kuasar/wasm
+cargo run --features="wasmedge, wasmedge_wasi_nn" -- --listen /run/wasm-sandboxer.sock --dir /run/kuasar-wasm
+```
+
+### 3. Configure and run containerd
+
+Add the following sandboxer config to the containerd config file `/etc/containerd/config.toml`:
+```toml
+[proxy_plugins]
+  [proxy_plugins.wasm]
+    type = "sandbox"
+    address = "/run/wasm-sandboxer.sock"
+
+[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.kuasar-wasm]
+  runtime_type = "io.containerd.kuasar-wasm.v1"
+  sandboxer = "wasm"
+```
+
+Then build containerd and run it with the environment variable `ENABLE_CRI_SANDBOXES=1`.
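+
+For example (a minimal sketch, assuming a containerd build with sandbox-mode CRI support; adjust paths to your environment):
+```bash
+# Start containerd with sandbox-mode CRI enabled, so that pods using the
+# kuasar-wasm runtime are dispatched to the external "wasm" sandboxer.
+sudo ENABLE_CRI_SANDBOXES=1 containerd --config /etc/containerd/config.toml
+```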
+
+### 4. Create the Kuasar wasm RuntimeClass
+
+Suppose we are in a Kubernetes cluster where all workloads are managed by Kubernetes. How does the container
+engine (containerd) know which runtime a workload should run in?
+
+[Container Runtimes](https://kubernetes.io/docs/setup/production-environment/container-runtimes/) describes how Kubernetes launches and
+runs containers. To map workloads to the Kuasar wasm runtime, create a new RuntimeClass with `kubectl apply -f kuasar-wasm-runtimeclass.yaml`:
+```yaml
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: kuasar-wasm
+handler: kuasar-wasm
+```
+
+Now the container engine knows what the `kuasar-wasm` runtime is.
+
+### 5. Deploy your LLM workload
+
+The last step is to deploy the LLM workload; you can use the Docker image built in step 1.
+
+Run `kubectl apply -f llama-deploy.yaml`.
+
+Here is an example `llama-deploy.yaml`:
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama
+  labels:
+    app: llama
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      app: llama
+  template:
+    metadata:
+      labels:
+        app: llama
+    spec:
+      containers:
+        - command:
+            - llama-api-server.wasm
+          args: ["--prompt-template", "llama-3-chat", "--ctx-size", "4096", "--model-name", "Llama-3-8B"]
+          env:
+            - name: io.kuasar.wasm.nn_preload
+              value: default:GGML:AUTO:Meta-Llama-3-8B-Instruct-Q5_K_M.gguf
+          image: docker.io/kuasario/llama-api-server:v1
+          name: llama-api-server
+      runtimeClassName: kuasar-wasm
+```
+Make sure `runtimeClassName` matches the RuntimeClass created in step 4.
+
+Please note that we define an env `io.kuasar.wasm.nn_preload`, which tells Kuasar what the `wasi_nn`
+plugin should preload. Its value has the form `alias:backend:target:path`, i.e. the model alias, the inference backend,
+the execution target and the model file; in this example, `default:GGML:AUTO:Meta-Llama-3-8B-Instruct-Q5_K_M.gguf`.
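+
+Once the pods are running, you can run a quick smoke test from a node (or another pod) in the cluster. This is
+only a sketch: port 8080 and the `/v1/chat/completions` route are assumptions based on llama-api-server defaults,
+so adjust them if you changed the server flags:
+```bash
+# Grab the IP of one llama pod and send an OpenAI-style chat completion request.
+POD_IP=$(kubectl get pod -l app=llama -o jsonpath='{.items[0].status.podIP}')
+curl -X POST http://${POD_IP}:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Llama-3-8B", "messages": [{"role": "user", "content": "Say hello"}]}'
+```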
+
+## Extension: Try with a Kubernetes Service
+
+In Kubernetes, a [Service](https://kubernetes.io/docs/concepts/services-networking/service/) is a method for exposing a
+network application that is running as one or more Pods in your cluster.
+
+You can create a ClusterIP Service, a LoadBalancer Service or any other Service type you want, and use it to
+access the LLM service from inside or outside the cluster.
+
+We do not cover this in detail since it is independent of Kuasar, but a minimal sketch is shown below.
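+
+A minimal ClusterIP Service could look like this (again assuming the server listens on its default port 8080):
+```yaml
+# Exposes the llama Deployment inside the cluster on port 80.
+# targetPort 8080 is an assumption based on llama-api-server's default port.
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama
+spec:
+  selector:
+    app: llama
+  ports:
+    - port: 80
+      targetPort: 8080
+```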
diff --git a/wasm/src/wasmedge.rs b/wasm/src/wasmedge.rs
index 1249a502..dd5b5a39 100644
--- a/wasm/src/wasmedge.rs
+++ b/wasm/src/wasmedge.rs
@@ -87,14 +87,6 @@ impl Default for WasmEdgeContainerFactory {
         PluginManager::load(None).unwrap();
         let mut host_options = HostRegistrationConfigOptions::default();
         host_options = host_options.wasi(true);
-        #[cfg(all(
-            target_os = "linux",
-            feature = "wasmedge_wasi_nn",
-            target_arch = "x86_64"
-        ))]
-        {
-            host_options = host_options.wasi_nn(true);
-        }
         let config = ConfigBuilder::new(CommonConfigOptions::default())
             .with_host_registration_config(host_options)
             .build()
@@ -161,7 +153,10 @@ impl ContainerFactory for WasmEdgeContainerFactory {
 impl ProcessLifecycle for WasmEdgeInitLifecycle {
     async fn start(&self, p: &mut InitProcess) -> containerd_shim::Result<()> {
         let spec = &p.lifecycle.spec;
-        let vm = p.lifecycle.prototype_vm.clone();
+        // Allow `vm` to be mutable; it is replaced when the wasmedge_wasi_nn feature is enabled.
+        #[allow(unused_mut)]
+        #[allow(unused_assignments)]
+        let mut vm = p.lifecycle.prototype_vm.clone();
         let args = get_args(spec);
         let envs = get_envs(spec);
         let rootfs = get_rootfs(spec).ok_or_else(|| {
@@ -198,6 +193,45 @@ impl ProcessLifecycle for WasmEdgeInitLifecycle {
                 format!("failed to add task to cgroup: {}", cgroup_path)
             ))?;
         }
+        // Only create a new VM instance when the wasmedge_wasi_nn feature is enabled.
+        #[cfg(all(
+            target_os = "linux",
+            feature = "wasmedge_wasi_nn",
+            target_arch = "x86_64"
+        ))]
+        {
+            const NN_PRELOAD_KEY: &str = "io.kuasar.wasm.nn_preload";
+            if let Some(process) = p.lifecycle.spec.process() {
+                if let Some(env) = process.env() {
+                    if let Some(v) =
+                        env.iter().find(|k| k.contains(&NN_PRELOAD_KEY.to_string()))
+                    {
+                        if let Some(nn_preload) =
+                            v.strip_prefix::<&str>(format!("{}=", NN_PRELOAD_KEY).as_ref())
+                        {
+                            log::info!("found nn_preload: {}", nn_preload);
+                            if let Some(rootfs) = spec.root().as_ref() {
+                                pre_load_with_new_rootfs(nn_preload, rootfs.path())
+                                    .unwrap();
+                            }
+                        }
+                    }
+                }
+            }
+
+            let host_options = HostRegistrationConfigOptions::default().wasi(true);
+            let config = ConfigBuilder::new(CommonConfigOptions::default())
+                .with_host_registration_config(host_options)
+                .build()
+                .map_err(other_error!(e, "generate default wasmedge config"))?;
+
+            vm = VmBuilder::new()
+                .with_config(config)
+                .with_plugin_wasi_nn()
+                .with_plugin("wasi_logging", None)
+                .build()
+                .unwrap();
+        }
         match run_wasi_func(vm, args, envs, preopens, p) {
             Ok(_) => exit(0),
             // TODO add a pipe? to return detailed error message
@@ -461,3 +495,36 @@ pub async fn process_exits(task: &TaskService) {
         }
     });
 }
+
+#[cfg(all(
+    target_os = "linux",
+    feature = "wasmedge_wasi_nn",
+    target_arch = "x86_64"
+))]
+fn pre_load_with_new_rootfs(
+    preload: &str,
+    rootfs: &std::path::PathBuf,
+) -> Result<(), WasmEdgeError> {
+    use wasmedge_sdk::plugin::{ExecutionTarget, GraphEncoding};
+    let nn_preload: Vec<&str> = preload.split(':').collect();
+    if nn_preload.len() != 4 {
+        return Err(WasmEdgeError::Operation(format!(
+            "Failed to convert to NNPreload value. Invalid preload string: {}. The correct format is: 'alias:backend:target:path'",
+            preload
+        )));
+    }
+    // Split into (alias, backend, target, path); the model path is resolved
+    // relative to the container rootfs so the file inside the image is used.
+    let (alias, backend, target, path) = (
+        nn_preload[0].to_string(),
+        nn_preload[1]
+            .parse::<GraphEncoding>()
+            .map_err(|err| WasmEdgeError::Operation(err.to_string()))?,
+        nn_preload[2]
+            .parse::<ExecutionTarget>()
+            .map_err(|err| WasmEdgeError::Operation(err.to_string()))?,
+        std::path::Path::new(rootfs).join(nn_preload[3]),
+    );
+    PluginManager::nn_preload(vec![wasmedge_sdk::plugin::NNPreload::new(
+        alias, backend, target, path,
+    )]);
+    Ok(())
+}