From 1bc1f81f21ffb7699d3e30530eb2806c78d873b4 Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Thu, 1 Aug 2024 22:39:40 +0000
Subject: [PATCH 1/4] updating vllm docs to use llama3 405B as example

---
 docs/examples/vllm/README.md        |  8 ++--
 docs/examples/vllm/build/Dockerfile |  2 +-
 docs/examples/vllm/lws.yaml         | 57 ++++++++++++++++++++---------
 3 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/docs/examples/vllm/README.md b/docs/examples/vllm/README.md
index 5a9df4f2..bf65902e 100644
--- a/docs/examples/vllm/README.md
+++ b/docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
 Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)
 
 ## Deploy LeaderWorkerSet of vLLM
-We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (tp=2).
+We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 machines per pod (tensor_parallel_size=8).
 The leader pod runs the Ray head and the http server, while the workers run the Ray workers.
 
 ```shell
@@ -67,7 +67,7 @@ Open another terminal and send a request
 curl http://localhost:8080/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-    "model": "facebook/opt-125m",
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
     "prompt": "San Francisco is a",
     "max_tokens": 7,
     "temperature": 0
@@ -80,11 +80,11 @@ The output should be similar to the following
   "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
   "object": "text_completion",
   "created": 1715138766,
-  "model": "facebook/opt-125m",
+  "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
   "choices": [
     {
       "index": 0,
-      "text": " great place to live. I",
+      "text": " top destination for foodies, with",
       "logprobs": null,
       "finish_reason": "length",
       "stop_reason": null
diff --git a/docs/examples/vllm/build/Dockerfile b/docs/examples/vllm/build/Dockerfile
index 34d6ecff..b292a929 100644
--- a/docs/examples/vllm/build/Dockerfile
+++ b/docs/examples/vllm/build/Dockerfile
@@ -1,2 +1,2 @@
-FROM docker.io/vllm/vllm-openai:v0.4.1
+FROM docker.io/vllm/vllm-openai:v0.5.3.post1
 COPY ray_init.sh /vllm-workspace/ray_init.sh
diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 9328f802..c2c9ab0b 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -14,49 +14,60 @@ spec:
       spec:
         containers:
           - name: vllm-leader
-            # this image is build with the Dockerfile under ./build
-            image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
+            image:
             env:
               - name: RAY_CLUSTER_SIZE
                 valueFrom:
                   fieldRef:
                     fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/size']
+              - name: HUGGING_FACE_HUB_TOKEN
+                value:
             command:
               - sh
               - -c
               - "/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
-                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --swap-space 2 --tensor-parallel-size 2"
+                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
             resources:
               limits:
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: "125"
               requests:
-                cpu: "4"
-                memory: 8Gi
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: "125"
             ports:
               - containerPort: 8080
-            readinessProbe:
-              tcpSocket:
-                port: 8080
-              initialDelaySeconds: 15
-              periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
     workerTemplate:
       spec:
         containers:
          - name: vllm-worker
-            # this image is build with the Dockerfile under ./build
-            image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
+            image:
             command:
               - sh
               - -c
               - "/vllm-workspace/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local"
             resources:
               limits:
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: 125
               requests:
-                cpu: "4"
-                memory: 8Gi
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: 125
             env:
               - name: LEADER_NAME
                 valueFrom:
@@ -70,3 +81,13 @@ spec:
                 valueFrom:
                   fieldRef:
                     fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
+              - name: HUGGING_FACE_HUB_TOKEN
+                value:
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
\ No newline at end of file

From 67282e73ff98837761c007e6de250b24d9197af1 Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Mon, 5 Aug 2024 21:47:51 +0000
Subject: [PATCH 2/4] addressed comments

---
 docs/examples/vllm/README.md | 2 +-
 docs/examples/vllm/lws.yaml  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/examples/vllm/README.md b/docs/examples/vllm/README.md
index bf65902e..89696e81 100644
--- a/docs/examples/vllm/README.md
+++ b/docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
 Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)
 
 ## Deploy LeaderWorkerSet of vLLM
-We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 machines per pod (tensor_parallel_size=8).
+We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 GPUs per pod (tensor_parallel_size=8).
 The leader pod runs the Ray head and the http server, while the workers run the Ray workers.
 
 ```shell
diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index c2c9ab0b..98e70597 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -90,4 +90,5 @@ spec:
           - name: dshm
             emptyDir:
               medium: Memory
-              sizeLimit: 15Gi
\ No newline at end of file
+              sizeLimit: 15Gi
+
\ No newline at end of file

From 4df5479e3066d2b8c6d74a06ab0a213407e044be Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Mon, 5 Aug 2024 23:42:27 +0000
Subject: [PATCH 3/4] fixed resource limit and request

---
 docs/examples/vllm/lws.yaml | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 98e70597..0691d897 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -31,13 +31,10 @@ spec:
               limits:
                 nvidia.com/gpu: "8"
                 memory: 1124Gi
-                ephemeral-storage: 2600Gi
-                cpu: "125"
+                ephemeral-storage: 800Gi
               requests:
-                nvidia.com/gpu: "8"
-                memory: 1124Gi
-                ephemeral-storage: 2600Gi
-                cpu: "125"
+                ephemeral-storage: 800Gi
+                cpu: 125
             ports:
               - containerPort: 8080
             volumeMounts:
@@ -61,12 +58,9 @@ spec:
               limits:
                 nvidia.com/gpu: "8"
                 memory: 1124Gi
-                ephemeral-storage: 2600Gi
-                cpu: 125
+                ephemeral-storage: 800Gi
               requests:
-                nvidia.com/gpu: "8"
-                memory: 1124Gi
-                ephemeral-storage: 2600Gi
+                ephemeral-storage: 800Gi
                 cpu: 125
             env:
               - name: LEADER_NAME
@@ -91,4 +85,3 @@ spec:
             emptyDir:
               medium: Memory
               sizeLimit: 15Gi
-
\ No newline at end of file

From a3b786452e092a320a1c9e72c341f433dba6b2c1 Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Tue, 6 Aug 2024 00:16:51 +0000
Subject: [PATCH 4/4] added readinessProbe

---
 docs/examples/vllm/lws.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 0691d897..225773f9 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -37,6 +37,11 @@ spec:
                 cpu: 125
             ports:
               - containerPort: 8080
+            readinessProbe:
+              tcpSocket:
+                port: 8080
+              initialDelaySeconds: 15
+              periodSeconds: 10
             volumeMounts:
               - mountPath: /dev/shm
                 name: dshm
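
After applying these patches, a quick smoke test is sketched below. It is only an assumption-laden sketch: it assumes the LeaderWorkerSet in docs/examples/vllm/lws.yaml is named `vllm` (so its leader pod is `vllm-0`), that a Service such as `vllm-leader` exposes the leader on port 8080 as the README describes, and that the empty `image:` and `HUGGING_FACE_HUB_TOKEN` fields have been filled in; adjust the names to match your setup.

```shell
# Deploy the example (the image must be built from docs/examples/vllm/build/Dockerfile).
kubectl apply -f docs/examples/vllm/lws.yaml

# Each replica should come up as one leader pod plus one worker pod (pipeline_parallel_size=2),
# each requesting 8 GPUs (tensor_parallel_size=8).
kubectl get pods

# Wait for the leader to report that the OpenAI-compatible server is listening,
# then forward port 8080 and send the completion request from the README.
kubectl logs -f vllm-0 -c vllm-leader            # assumes the LeaderWorkerSet is named "vllm"
kubectl port-forward svc/vllm-leader 8080:8080   # assumes a Service named "vllm-leader" exists
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Meta-Llama-3.1-405B-Instruct", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0}'
```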