From 1b4db354cc660dd333403c39c9507c5e5b985b60 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sat, 5 Apr 2025 19:59:17 -0700
Subject: [PATCH 01/12] llama4 support

---
 llm/llama-4/README.md   | 162 ++++++++++++++++++++++++++++++++++++++++
 llm/llama-4/llama4.yaml |  27 +++++++
 2 files changed, 189 insertions(+)
 create mode 100644 llm/llama-4/README.md
 create mode 100644 llm/llama-4/llama4.yaml

diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md
new file mode 100644
index 00000000000..c2d612cd981
--- /dev/null
+++ b/llm/llama-4/README.md
@@ -0,0 +1,162 @@

# Run Llama 4 on Kubernetes or Any Cloud

[Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025.

## Prerequisites

- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/) and request access to the model [meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8).
- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled.

## SkyPilot YAML

<details>
+Click to see the full recipe YAML + +```yaml +envs: + MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + +resources: + accelerators: { H100:8, H200:8, B100:8, B200:8, GB200:8 } + cpus: 32+ + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +setup: | + uv pip install vllm>0.8.2 + +run: | + echo 'Starting vllm api server...' + + vllm serve $MODEL_NAME \ + --port 8081 \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 430000 + +``` + +Wait until the model is ready (this can take 10+ minutes). + +🎉 **Congratulations!** 🎉 You have now launched the Llama 4 Maverick Instruct LLM on your infra. + +### Chat with Llama 4 Maverick with OpenAI API + +To curl `/v1/chat/completions`: +```console +ssh -L 8081:localhost:8081 llama4 + +curl http://localhost:8081/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ] + }' | jq . +``` +Example outputs: +```console +... +``` + +To stop the instance: +```console +sky stop llama4 +``` + +To shut down all resources: +```console +sky down llama4 +``` + +## Serving Llama-4: scaling up with SkyServe + + +With no change to the YAML, launch a fully managed service on your infra: +```console +HF_TOKEN=xxx sky serve up llama4.yaml -n llama4 --env HF_TOKEN +``` + +Wait until the service is ready: +```console +watch -n10 sky serve status llama4 +``` + +
+Example outputs: + +```console +Services +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +llama4 1 35s READY 2/2 xx.yy.zz.100:30001 + +Service Replicas +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +llama4 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +llama4 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +``` +
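If the service stays in `PROVISIONING` or a replica does not reach `READY`, the replica logs usually explain why. A quick check (a sketch, assuming the service name `llama4` and replica ID 1 as in the output above):
```console
sky serve logs llama4 1
```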
+ + +Get a single endpoint that load-balances across replicas: +```console +ENDPOINT=$(sky serve status --endpoint llama4) +``` + +> **Tip:** SkyServe fully manages the lifecycle of your replicas. For example, if a spot replica is preempted, the controller will automatically replace it. This significantly reduces the operational burden while saving costs. + +To curl the endpoint: +```console +curl http://$ENDPOINT/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer token' \ + --data '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "user", + "content": [ + {"type" : "text", "text": "Covert this logo to ASCII art"}, + {"type": "image_url", "image_url": {"url": "https://pbs.twimg.com/profile_images/1584596138635632640/HWexMoH5_400x400.jpg"}} + ] + }], + "max_tokens": 2048 + }' | jq . +``` + +To shut down all resources: +```console +sky serve down llama4 +``` + +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). + diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml new file mode 100644 index 00000000000..f811ec43d0d --- /dev/null +++ b/llm/llama-4/llama4.yaml @@ -0,0 +1,27 @@ +envs: + MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + +resources: + accelerators: { H100:8, H100:12, H200:8, GH200:8, B100:8, B200:8, GB200:8 } + cpus: 32+ + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 + +setup: | + # uv pip install vllm>0.8.2 + git clone https://github.com/houseroad/vllm.git + cd vllm + git checkout init_pr + export VLLM_USE_PRECOMPILED=1 + pip install -e . + +run: | + echo 'Starting vllm api server...' + + vllm serve $MODEL_NAME \ + --port 8081 \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 430000 From e017e0ed8a1223d01b20cc9746d0b357a15e569a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 5 Apr 2025 20:00:46 -0700 Subject: [PATCH 02/12] Add service section --- llm/llama-4/llama4.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml index f811ec43d0d..3c50bebb11c 100644 --- a/llm/llama-4/llama4.yaml +++ b/llm/llama-4/llama4.yaml @@ -25,3 +25,15 @@ run: | --port 8081 \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --max-model-len 430000 + +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? 
+ max_tokens: 1 From 5838aacd087e47e516bee53ab617416c79c805ec Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 5 Apr 2025 20:09:12 -0700 Subject: [PATCH 03/12] Add llama 4 --- docs/source/compute/show-gpus-h100-8.txt | 2 +- docs/source/examples/models/index.rst | 1 + docs/source/examples/models/llama-4.md | 1 + llm/llama-4/README.md | 4 ++-- 4 files changed, 5 insertions(+), 3 deletions(-) create mode 120000 docs/source/examples/models/llama-4.md diff --git a/docs/source/compute/show-gpus-h100-8.txt b/docs/source/compute/show-gpus-h100-8.txt index 17cf51a672f..e5e91059cf4 100644 --- a/docs/source/compute/show-gpus-h100-8.txt +++ b/docs/source/compute/show-gpus-h100-8.txt @@ -9,7 +9,7 @@ H100 8 GCP a3-highgpu-8g 80GB 208 1872 H100 8 Paperspace H100x8 - 128 640GB $ 47.600 - East Coast (NY2) H100 8 DO gpu-h100x8-640gb 80GB 160 1920GB $ 47.600 - tor1 H100 8 OCI BM.GPU.H100.8 80GB 224 2048GB $ 80.000 - eu-amsterdam-1 -H100 8 AWS p5.48xlarge 80GB 192 2048GB $ 98.320 $ 13.127 us-east-2 +H100 8 AWS p5.48xlarge 80GB 192 2048GB $ 98.320 $ 9.832 us-east-2 GPU QTY CLOUD INSTANCE_TYPE DEVICE_MEM vCPUs HOST_MEM HOURLY_PRICE HOURLY_SPOT_PRICE REGION H100-MEGA 8 GCP a3-megagpu-8g 80GB 208 1872GB $ 92.214 $ 21.208 us-central1 diff --git a/docs/source/examples/models/index.rst b/docs/source/examples/models/index.rst index 6fe280ce2fb..a1bdd18a2bf 100644 --- a/docs/source/examples/models/index.rst +++ b/docs/source/examples/models/index.rst @@ -8,6 +8,7 @@ Models DeepSeek-R1 Distilled DeepSeek-Janus Gemma 3 + Llama 4 Llama 3.2 Llama 3.1 Llama 3 diff --git a/docs/source/examples/models/llama-4.md b/docs/source/examples/models/llama-4.md new file mode 120000 index 00000000000..fd8197c8f79 --- /dev/null +++ b/docs/source/examples/models/llama-4.md @@ -0,0 +1 @@ +../../../../llm/llama-4/README.md \ No newline at end of file diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index c2d612cd981..567e0b5e3b8 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -2,7 +2,7 @@ # Run Llama 4 on Kubernetes or Any Cloud - + [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. @@ -61,7 +61,7 @@ Wait until the model is ready (this can take 10+ minutes). 🎉 **Congratulations!** 🎉 You have now launched the Llama 4 Maverick Instruct LLM on your infra. -### Chat with Llama 4 Maverick with OpenAI API +### Chat with Llama 4 with OpenAI API To curl `/v1/chat/completions`: ```console From 640d3bcacf3f6de323f0ea3cdcc60754559cb075 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 00:40:14 -0700 Subject: [PATCH 04/12] use 0.8.3 for vllm --- llm/llama-4/llama4.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml index 3c50bebb11c..52426af0bd9 100644 --- a/llm/llama-4/llama4.yaml +++ b/llm/llama-4/llama4.yaml @@ -11,12 +11,7 @@ resources: ports: 8081 setup: | - # uv pip install vllm>0.8.2 - git clone https://github.com/houseroad/vllm.git - cd vllm - git checkout init_pr - export VLLM_USE_PRECOMPILED=1 - pip install -e . + uv pip install "vllm>=0.8.3" run: | echo 'Starting vllm api server...' 
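A note on the quoting introduced by this patch: unquoted, the `>` in `vllm>0.8.2` is parsed by the shell as output redirection, so the earlier setup line would install an unpinned vllm and write a stray file named `0.8.2`. The quotes ensure the version constraint actually reaches the installer. A minimal illustration of the difference:

```console
uv pip install vllm>0.8.2     # shell redirection: installs latest vllm, creates a file named "0.8.2"
uv pip install "vllm>=0.8.3"  # quoted: uv receives the >=0.8.3 constraint
```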
From fe35563e32fafd1f2219b1e1de50a0b0b2ceed2d Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sun, 6 Apr 2025 01:26:48 -0700
Subject: [PATCH 05/12] minor readme fix

---
 llm/llama-4/README.md | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md
index 567e0b5e3b8..cfc4d855e7a 100644
--- a/llm/llama-4/README.md
+++ b/llm/llama-4/README.md
@@ -16,27 +16,12 @@
## SkyPilot YAML

-<details>
-Click to see the full recipe YAML - ```yaml envs: MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. -service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_tokens: 1 - resources: accelerators: { H100:8, H200:8, B100:8, B200:8, GB200:8 } cpus: 32+ @@ -65,9 +50,9 @@ Wait until the model is ready (this can take 10+ minutes). To curl `/v1/chat/completions`: ```console -ssh -L 8081:localhost:8081 llama4 +ENDPOINT=$(sky status --endpoint 8081 llama4) -curl http://localhost:8081/v1/chat/completions \ +curl http://$ENDPOINT/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", @@ -83,10 +68,6 @@ curl http://localhost:8081/v1/chat/completions \ ] }' | jq . ``` -Example outputs: -```console -... -``` To stop the instance: ```console @@ -116,13 +97,13 @@ watch -n10 sky serve status llama4 ```console Services -NAME VERSION UPTIME STATUS REPLICAS ENDPOINT -llama4 1 35s READY 2/2 xx.yy.zz.100:30001 +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +llama4 1 35s READY 2/2 xx.yy.zz.100:30001 Service Replicas -SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION -llama4 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 -llama4 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +llama4 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 +llama4 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'H100': 8}) READY us-east4 ```
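With the SSH tunnel replaced by `sky status --endpoint` above, the OpenAI-compatible server can be smoke-tested from anywhere (a sketch; assumes the cluster is named `llama4` and `jq` is installed):

```console
ENDPOINT=$(sky status --endpoint 8081 llama4)
curl http://$ENDPOINT/v1/models | jq '.data[].id'
```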
From e11dc185b7b7fa3a3dead00f094beeb52d09bc74 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 01:30:30 -0700 Subject: [PATCH 06/12] fix input --- llm/llama-4/README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index cfc4d855e7a..daced3b28ea 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -118,20 +118,20 @@ ENDPOINT=$(sky serve status --endpoint llama4) To curl the endpoint: ```console curl http://$ENDPOINT/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer token' \ - --data '{ - "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", - "messages": [ - { - "role": "user", - "content": [ - {"type" : "text", "text": "Covert this logo to ASCII art"}, - {"type": "image_url", "image_url": {"url": "https://pbs.twimg.com/profile_images/1584596138635632640/HWexMoH5_400x400.jpg"}} - ] - }], - "max_tokens": 2048 - }' | jq . + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ] + }' | jq . ``` To shut down all resources: From e1c18f0550c4da85d2a3ff2bc8b41a32b0a7d29a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 01:34:04 -0700 Subject: [PATCH 07/12] Add video --- llm/llama-4/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index daced3b28ea..391a768d188 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,6 +7,7 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. +https://github.com/user-attachments/assets/7519d98a-f10f-4671-8c2b-a2fe36a6789d ## Prerequisites From 5fec58cadad0b698239cfdf344ed45935d194bf6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 01:41:25 -0700 Subject: [PATCH 08/12] Add video --- llm/llama-4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index 391a768d188..089cc42b556 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,7 +7,7 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. -https://github.com/user-attachments/assets/7519d98a-f10f-4671-8c2b-a2fe36a6789d +https://github.com/user-attachments/assets/4cd40fde-6418-4722-94db-372edf50eb5e ## Prerequisites From 8c54cd14bb28d1fb67a498013c2e591d4afcd2b9 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 11:34:13 -0700 Subject: [PATCH 09/12] Update vllm version in README.md --- llm/llama-4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index 089cc42b556..f199922132c 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -31,7 +31,7 @@ resources: ports: 8081 # Expose to internet traffic. setup: | - uv pip install vllm>0.8.2 + uv pip install vllm==0.8.3 run: | echo 'Starting vllm api server...' 
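Note that this pins the README to `vllm==0.8.3`, while `llama4.yaml` (patch 04) keeps the looser `"vllm>=0.8.3"`. Either way, the version that actually resolved on the cluster can be confirmed over SSH (a sketch; assumes SkyPilot's generated SSH alias `llama4`):

```console
ssh llama4 'uv pip show vllm | grep -i version'
```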
From 326d90183230438ac80231737da272d6b5ce77cc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 11:43:00 -0700 Subject: [PATCH 10/12] Update video --- llm/llama-4/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index f199922132c..a69bbd41192 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,7 +7,7 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. -https://github.com/user-attachments/assets/4cd40fde-6418-4722-94db-372edf50eb5e +https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded ## Prerequisites From a422415c3fec651bde7da353f2d91f5552364dfe Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 11:55:21 -0700 Subject: [PATCH 11/12] Update readme --- llm/llama-4/README.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/llm/llama-4/README.md b/llm/llama-4/README.md index a69bbd41192..63d0904e900 100644 --- a/llm/llama-4/README.md +++ b/llm/llama-4/README.md @@ -7,7 +7,8 @@ [Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) family was released by Meta on Apr 5, 2025. -https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded +![](https://i.imgur.com/kjqLX87.png) + ## Prerequisites @@ -15,12 +16,19 @@ https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded - Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. -## SkyPilot YAML +## Run Llama 4 +```bash +sky launch llama4.yaml -c llama4 --env HF_TOKEN +``` + +https://github.com/user-attachments/assets/48cdc44a-31a5-45f0-93be-7a8b6c6a0ded + + +The `llama4.yaml` file is as follows: ```yaml envs: MODEL_NAME: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - # MODEL_NAME: meta-llama/Llama-3.2-3B-Vision HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. resources: @@ -43,7 +51,11 @@ run: | ``` -Wait until the model is ready (this can take 10+ minutes). +You can use other models by setting different `MODEL_NAME`. +```bash +sky launch llama4.yaml -c llama4 --env HF_TOKEN --env MODEL_NAME=meta-llama/Llama-4-Scout-17B-16E-Instruct +``` + 🎉 **Congratulations!** 🎉 You have now launched the Llama 4 Maverick Instruct LLM on your infra. From 9de9ed23bcdb27213a938cb29b69082f1e13ea6e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 6 Apr 2025 14:10:21 -0700 Subject: [PATCH 12/12] update accelerators --- llm/llama-4/llama4.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama-4/llama4.yaml b/llm/llama-4/llama4.yaml index 52426af0bd9..bdce6080b3a 100644 --- a/llm/llama-4/llama4.yaml +++ b/llm/llama-4/llama4.yaml @@ -4,7 +4,7 @@ envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. resources: - accelerators: { H100:8, H100:12, H200:8, GH200:8, B100:8, B200:8, GB200:8 } + accelerators: { H100:8, H200:8, B100:8, B200:8, GB200:8 } cpus: 32+ disk_size: 512 # Ensure model checkpoints can fit. disk_tier: best
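To try the series end to end, apply it to a SkyPilot checkout and launch the final recipe (a sketch; assumes the series is saved as a single mbox file, here named `llama4-series.mbox`):

```console
git am llama4-series.mbox
HF_TOKEN=xxx sky launch llm/llama-4/llama4.yaml -c llama4 --env HF_TOKEN
```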