From 1b4db354cc660dd333403c39c9507c5e5b985b60 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sat, 5 Apr 2025 19:59:17 -0700
Subject: [PATCH 01/12] llama4 support

---
 llm/llama-4/README.md   | 162 ++++++++++++++++++++++++++++++++++++++++
 llm/llama-4/llama4.yaml |  27 +++++++
 2 files changed, 189 insertions(+)
 create mode 100644 llm/llama-4/README.md
 create mode 100644 llm/llama-4/llama4.yaml

diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md
new file mode 100644
index 00000000000..c2d612cd981
--- /dev/null
+++ b/llm/llama-4/README.md
@@ -0,0 +1,162 @@

# Run Llama 4 on Kubernetes or Any Cloud

[Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025.

## Prerequisites

- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/) and request access to the model [meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8).
- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled.

## SkyPilot YAML

<details>
+Click to see the full recipe YAML + +```yaml +envs: + MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + +resources: + accelerators: { H100:8, H200:8, B100:8, B200:8, GB200:8 } + cpus: 32+ + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +setup: | + uv pip install vllm>0.8.2 + +run: | + echo 'Starting vllm api server...' + + vllm serve $MODEL_NAME \ + --port 8081 \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 430000 + +``` + +Wait until the model is ready (this can take 10+ minutes). + +🎉 **Congratulations!** 🎉 You have now launched the Llama 4 Maverick Instruct LLM on your infra. + +### Chat with Llama 4 Maverick with OpenAI API + +To curl `/v1/chat/completions`: +```console +ssh -L 8081:localhost:8081 llama4 + +curl http://localhost:8081/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ] + }' | jq . +``` +Example outputs: +```console +... +``` + +To stop the instance: +```console +sky stop llama4 +``` + +To shut down all resources: +```console +sky down llama4 +``` + +## Serving Llama-4: scaling up with SkyServe + + +With no change to the YAML, launch a fully managed service on your infra: +```console +HF_TOKEN=xxx sky serve up llama4.yaml -n llama4 --env HF_TOKEN +``` + +Wait until the service is ready: +```console +watch -n10 sky serve status llama4 +``` + +
+Example outputs: + +```console +Services +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +llama4 1 35s READY 2/2 xx.yy.zz.100:30001 + +Service Replicas +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +llama4 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +llama4 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +``` +
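If the service stays in `PROVISIONING` or a replica does not reach `READY`, the replica logs usually explain why. A quick check (a sketch, assuming the service name `llama4` and replica ID 1 as in the output above):
```console
sky serve logs llama4 1
```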
+ + +Get a single endpoint that load-balances across replicas: +```console +ENDPOINT=$(sky serve status --endpoint llama4) +``` + +> **Tip:** SkyServe fully manages the lifecycle of your replicas. For example, if a spot replica is preempted, the controller will automatically replace it. This significantly reduces the operational burden while saving costs. + +To curl the endpoint: +```console +curl http://$ENDPOINT/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer token' \ + --data '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "user", + "content": [ + {"type" : "text", "text": "Covert this logo to ASCII art"}, + {"type": "image_url", "image_url": {"url": "https://pbs.twimg.com/profile_images/1584596138635632640/HWexMoH5_400x400.jpg"}} + ] + }], + "max_tokens": 2048 + }' | jq . +``` + +To shut down all resources: +```console +sky serve down llama4 +``` + +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). + diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml new file mode 100644 index 00000000000..f811ec43d0d --- /dev/null +++ b/llm/llama-4/llama4.yaml @@ -0,0 +1,27 @@ +envs: + MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +resources: + accelerators: { H100:8, H100:12, H200:8, GH200:8, B100:8, B200:8, GB200:8 } + cpus: 32+ + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 + +setup: | + # uv pip install vllm>0.8.2 + git clone https://github.com/houseroad/vllm.git + cd vllm + git checkout init_pr + export VLLM_USE_PRECOMPILED=1 + pip install -e . + +run: | + echo 'Starting vllm api server...' + + vllm serve $MODEL_NAME \ + --port 8081 \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 430000 From e017e0ed8a1223d01b20cc9746d0b357a15e569a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 5 Apr 2025 20:00:46 -0700 Subject: [PATCH 02/12] Add service section --- llm/llama-4/llama4.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml index f811ec43d0d..3c50bebb11c 100644 --- a/llm/llama-4/llama4.yaml +++ b/llm/llama-4/llama4.yaml @@ -25,3 +25,15 @@ run: | --port 8081 \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --max-model-len 430000 + +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? 
+ max_tokens: 1 From 5838aacd087e47e516bee53ab617416c79c805ec Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 5 Apr 2025 20:09:12 -0700 Subject: [PATCH 03/12] Add llama 4 --- docs/source/compute/show-gpus-h100-8.txt | 2 +- docs/source/examples/models/index.rst | 1 + docs/source/examples/models/llama-4.md | 1 + llm/llama-4/README.md | 4 ++-- 4 files changed, 5 insertions(+), 3 deletions(-) create mode 120000 docs/source/examples/models/llama-4.md diff --git a/docs/source/compute/show-gpus-h100-8.txt b/docs/source/compute/show-gpus-h100-8.txt index 17cf51a672f..e5e91059cf4 100644 --- a/docs/source/compute/show-gpus-h100-8.txt +++ b/docs/source/compute/show-gpus-h100-8.txt @@ -9,7 +9,7 @@ H100 8 GCP a3-highgpu-8g 80GB 208 1872 H100 8 Paperspace H100x8 - 128 640GB $ 47.600 - East Coast (NY2) H100 8 DO gpu-h100x8-640gb 80GB 160 1920GB $ 47.600 - tor1 H100 8 OCI BM.GPU.H100.8 80GB 224 2048GB $ 80.000 - eu-amsterdam-1 -H100 8 AWS p5.48xlarge 80GB 192 2048GB $ 98.320 $ 13.127 us-east-2 +H100 8 AWS p5.48xlarge 80GB 192 2048GB $ 98.320 $ 9.832 us-east-2 GPU QTY CLOUD INSTANCE_TYPE DEVICE_MEM vCPUs HOST_MEM HOURLY_PRICE HOURLY_SPOT_PRICE REGION H100-MEGA 8 GCP a3-megagpu-8g 80GB 208 1872GB $ 92.214 $ 21.208 us-central1 diff --git a/docs/source/examples/models/index.rst b/docs/source/examples/models/index.rst index 6fe280ce2fb..a1bdd18a2bf 100644 --- a/docs/source/examples/models/index.rst +++ b/docs/source/examples/models/index.rst @@ -8,6 +8,7 @@ Models DeepSeek-R1 Distilled DeepSeek-Janus Gemma 3 + Llama 4 Llama 3.2 Llama 3.1 Llama 3 diff --git a/docs/source/examples/models/llama-4.md b/docs/source/examples/models/llama-4.md new file mode 120000 index 00000000000..fd8197c8f79 --- /dev/null +++ b/docs/source/examples/models/llama-4.md @@ -0,0 +1 @@ +../../../../llm/llama-4/README.md \ No newline at end of file diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index c2d612cd981..567e0b5e3b8 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -2,7 +2,7 @@ # Run Llama 4 on Kubernetes or Any Cloud - + [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. @@ -61,7 +61,7 @@ Wait until the model is ready (this can take 10+ minutes). 🎉 **Congratulations!** 🎉 You have now launched the Llama 4 Maverick Instruct LLM on your infra. -### Chat with Llama 4 Maverick with OpenAI API +### Chat with Llama 4 with OpenAI API To curl `/v1/chat/completions`: ```console From 640d3bcacf3f6de323f0ea3cdcc60754559cb075 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 00:40:14 -0700 Subject: [PATCH 04/12] use 0.8.3 for vllm --- llm/llama-4/llama4.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml index 3c50bebb11c..52426af0bd9 100644 --- a/llm/llama-4/llama4.yaml +++ b/llm/llama-4/llama4.yaml @@ -11,12 +11,7 @@ resources: ports: 8081 setup: | - # uv pip install vllm>0.8.2 - git clone https://github.com/houseroad/vllm.git - cd vllm - git checkout init_pr - export VLLM_USE_PRECOMPILED=1 - pip install -e . + uv pip install "vllm>=0.8.3" run: | echo 'Starting vllm api server...' 
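A note on the quoting introduced by this patch: unquoted, the `>` in `vllm>0.8.2` is parsed by the shell as output redirection, so the earlier setup line would install an unpinned vllm and write a stray file named `0.8.2`. The quotes ensure the version constraint actually reaches the installer. A minimal illustration of the difference:

```console
uv pip install vllm>0.8.2     # shell redirection: installs latest vllm, creates a file named "0.8.2"
uv pip install "vllm>=0.8.3"  # quoted: uv receives the >=0.8.3 constraint
```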
From fe35563e32fafd1f2219b1e1de50a0b0b2ceed2d Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sun, 6 Apr 2025 01:26:48 -0700
Subject: [PATCH 05/12] minor readme fix

---
 llm/llama-4/README.md | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md
index 567e0b5e3b8..cfc4d855e7a 100644
--- a/llm/llama-4/README.md
+++ b/llm/llama-4/README.md
@@ -16,27 +16,12 @@
## SkyPilot YAML

-<details>
-Click to see the full recipe YAML - ```yaml envs: MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. -service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_tokens: 1 - resources: accelerators: { H100:8, H200:8, B100:8, B200:8, GB200:8 } cpus: 32+ @@ -65,9 +50,9 @@ Wait until the model is ready (this can take 10+ minutes). To curl `/v1/chat/completions`: ```console -ssh -L 8081:localhost:8081 llama4 +ENDPOINT=$(sky status --endpoint 8081 llama4) -curl http://localhost:8081/v1/chat/completions \ +curl http://$ENDPOINT/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", @@ -83,10 +68,6 @@ curl http://localhost:8081/v1/chat/completions \ ] }' | jq . ``` -Example outputs: -```console -... -``` To stop the instance: ```console @@ -116,13 +97,13 @@ watch -n10 sky serve status llama4 ```console Services -NAME VERSION UPTIME STATUS REPLICAS ENDPOINT -llama4 1 35s READY 2/2 xx.yy.zz.100:30001 +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +llama4 1 35s READY 2/2 xx.yy.zz.100:30001 Service Replicas -SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION -llama4 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 -llama4 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +llama4 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +llama4 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 ```
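With the SSH tunnel replaced by `sky status --endpoint` above, the OpenAI-compatible server can be smoke-tested from anywhere (a sketch; assumes the cluster is named `llama4` and `jq` is installed):

```console
ENDPOINT=$(sky status --endpoint 8081 llama4)
curl http://$ENDPOINT/v1/models | jq '.data[].id'
```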
From e11dc185b7b7fa3a3dead00f094beeb52d09bc74 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 01:30:30 -0700 Subject: [PATCH 06/12] fix input --- llm/llama-4/README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index cfc4d855e7a..daced3b28ea 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -118,20 +118,20 @@ ENDPOINT=$(sky serve status --endpoint llama4) To curl the endpoint: ```console curl http://$ENDPOINT/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer token' \ - --data '{ - "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", - "messages": [ - { - "role": "user", - "content": [ - {"type" : "text", "text": "Covert this logo to ASCII art"}, - {"type": "image_url", "image_url": {"url": "https://pbs.twimg.com/profile_images/1584596138635632640/HWexMoH5_400x400.jpg"}} - ] - }], - "max_tokens": 2048 - }' | jq . + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ] + }' | jq . ``` To shut down all resources: From e1c18f0550c4da85d2a3ff2bc8b41a32b0a7d29a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 01:34:04 -0700 Subject: [PATCH 07/12] Add video --- llm/llama-4/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index daced3b28ea..391a768d188 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,6 +7,7 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. +https://github.com/user-attachments/assets/7519d98a-f10f-4671-8c2b-a2fe36a6789d ## Prerequisites From 5fec58cadad0b698239cfdf344ed45935d194bf6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 01:41:25 -0700 Subject: [PATCH 08/12] Add video --- llm/llama-4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index 391a768d188..089cc42b556 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,7 +7,7 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. -https://github.com/user-attachments/assets/7519d98a-f10f-4671-8c2b-a2fe36a6789d +https://github.com/user-attachments/assets/4cd40fde-6418-4722-94db-372edf50eb5e ## Prerequisites From 8c54cd14bb28d1fb67a498013c2e591d4afcd2b9 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 11:34:13 -0700 Subject: [PATCH 09/12] Update vllm version in README.md --- llm/llama-4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index 089cc42b556..f199922132c 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -31,7 +31,7 @@ resources: ports: 8081 # Expose to internet traffic. setup: | - uv pip install vllm>0.8.2 + uv pip install vllm==0.8.3 run: | echo 'Starting vllm api server...' 
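Note that this pins the README to `vllm==0.8.3`, while `llama4.yaml` (patch 04) keeps the looser `"vllm>=0.8.3"`. Either way, the version that actually resolved on the cluster can be confirmed over SSH (a sketch; assumes SkyPilot's generated SSH alias `llama4`):

```console
ssh llama4 'uv pip show vllm | grep -i version'
```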
From 326d90183230438ac80231737da272d6b5ce77cc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 11:43:00 -0700 Subject: [PATCH 10/12] Update video --- llm/llama-4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index f199922132c..a69bbd41192 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,7 +7,7 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. -https://github.com/user-attachments/assets/4cd40fde-6418-4722-94db-372edf50eb5e +https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded ## Prerequisites From a422415c3fec651bde7da353f2d91f5552364dfe Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 11:55:21 -0700 Subject: [PATCH 11/12] Update readme --- llm/llama-4/README.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index a69bbd41192..63d0904e900 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,7 +7,8 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. -https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded +![](https://i.imgur.com/kjqLX87.png) + ## Prerequisites @@ -15,12 +16,19 @@ https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded - Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. -## SkyPilot YAML +## Run Llama 4 +```bash +sky launch llama4.yaml -c llama4 --env HF_TOKEN +``` + +https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded + + +The `llama4.yaml` file is as follows: ```yaml envs: MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. resources: @@ -43,7 +51,11 @@ run: | ``` -Wait until the model is ready (this can take 10+ minutes). +You can use other models by setting different `MODEL_NAME`. +```bash +sky launch llama4.yaml -c llama4 --env HF_TOKEN --env MODEL_NAME=meta-llama/Llama-4-Scout-17B-16E-Instruct +``` + 🎉 **Congratulations!** 🎉 You have now launched the Llama 4 Maverick Instruct LLM on your infra. From 9de9ed23bcdb27213a938cb29b69082f1e13ea6e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 14:10:21 -0700 Subject: [PATCH 12/12] update accelerators --- llm/llama-4/llama4.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml index 52426af0bd9..bdce6080b3a 100644 --- a/llm/llama-4/llama4.yaml +++ b/llm/llama-4/llama4.yaml @@ -4,7 +4,7 @@ envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. resources: - accelerators: { H100:8, H100:12, H200:8, GH200:8, B100:8, B200:8, GB200:8 } + accelerators: { H100:8, H200:8, B100:8, B200:8, GB200:8 } cpus: 32+ disk_size: 512 # Ensure model checkpoints can fit. disk_tier: best
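To try the series end to end, apply it to a SkyPilot checkout and launch the final recipe (a sketch; assumes the series is saved as a single mbox file, here named `llama4-series.mbox`):

```console
git am llama4-series.mbox
HF_TOKEN=xxx sky launch llm/llama-4/llama4.yaml -c llama4 --env HF_TOKEN
```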