updating vllm docs to use llama3 405B as example #185

Merged (4 commits) on Aug 6, 2024
8 changes: 4 additions & 4 deletions docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)

## Deploy LeaderWorkerSet of vLLM
We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (tp=2).
We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 GPUs per pod (tensor_parallel_size=8).
The leader pod runs the Ray head and the http server, while the workers run the Ray workers.

```shell
@@ -67,7 +67,7 @@ Open another terminal and send a request
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "facebook/opt-125m",
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
@@ -80,11 +80,11 @@ The output should be similar to the following
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"object": "text_completion",
"created": 1715138766,
"model": "facebook/opt-125m",
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{
"index": 0,
"text": " great place to live. I",
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
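For a quick sanity check of the topology this README change describes: each replica spans pipeline_parallel_size x tensor_parallel_size = 2 x 8 = 16 GPUs across its two pods, so the two replicas together request 32 GPUs. A minimal verification sketch, assuming the LeaderWorkerSet in lws.yaml is named `vllm` (its name sits outside the visible hunks) and the default LWS pod naming:

```shell
# Pods that belong to the LeaderWorkerSet: expect 4 in total,
# one leader and one worker per replica (2 replicas x 2 pods each).
kubectl get pods -l leaderworkerset.sigs.k8s.io/name=vllm

# The leader of the first replica (vllm-0 under default LWS naming) runs the
# Ray head and the OpenAI-compatible server; its logs should show the
# api_server listening on port 8080 once the 405B weights are loaded.
kubectl logs vllm-0
```

If the pods stay in Pending, the cluster most likely cannot satisfy the 8-GPU-per-pod requests defined in lws.yaml.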
2 changes: 1 addition & 1 deletion docs/examples/vllm/build/Dockerfile
@@ -1,2 +1,2 @@
FROM docker.io/vllm/vllm-openai:v0.4.1
FROM docker.io/vllm/vllm-openai:v0.5.3.post1
COPY ray_init.sh /vllm-workspace/ray_init.sh
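Because the manifest below now references a placeholder image, the updated Dockerfile has to be built and pushed somewhere the cluster can pull from before lws.yaml can be applied. A minimal sketch, with `registry.example.com/my-project` standing in for your own registry:

```shell
# Build the example image (vLLM v0.5.3.post1 plus ray_init.sh) and push it
# to a registry your cluster can pull from; the registry path is a placeholder.
REGISTRY=registry.example.com/my-project
docker build -t ${REGISTRY}/vllm-lws:v0.5.3.post1 docs/examples/vllm/build
docker push ${REGISTRY}/vllm-lws:v0.5.3.post1
# Use this tag in place of <image-built-from-dockerfile> in lws.yaml.
```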
46 changes: 33 additions & 13 deletions docs/examples/vllm/lws.yaml
@@ -14,49 +14,59 @@ spec:
spec:
containers:
- name: vllm-leader
# this image is built with the Dockerfile under ./build
image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
image: <image-built-from-dockerfile>
env:
- name: RAY_CLUSTER_SIZE
valueFrom:
fieldRef:
fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/size']
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --swap-space 2 --tensor-parallel-size 2"
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
nvidia.com/gpu: "1"
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
cpu: "4"
memory: 8Gi
nvidia.com/gpu: "1"
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
# this image is built with the Dockerfile under ./build
image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
image: <image-built-from-dockerfile>
command:
- sh
- -c
- "/vllm-workspace/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local"
resources:
limits:
nvidia.com/gpu: "1"
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
cpu: "4"
memory: 8Gi
nvidia.com/gpu: "1"
ephemeral-storage: 800Gi
cpu: 125
env:
- name: LEADER_NAME
valueFrom:
@@ -70,3 +80,13 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
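Since the manifest now ships with `<image-built-from-dockerfile>` and `<your-hf-token>` placeholders instead of concrete values, deploying it involves filling both in first. A hedged sketch (the image tag is the one pushed above; the Service name is assumed to be `vllm-leader`, which sits outside the visible hunks):

```shell
# Substitute the two placeholders and apply the manifest.
IMAGE=registry.example.com/my-project/vllm-lws:v0.5.3.post1  # image built from ./build
HF_TOKEN=hf_xxx  # token with access to the gated Llama 3.1 405B weights
sed -e "s|<image-built-from-dockerfile>|${IMAGE}|g" \
    -e "s|<your-hf-token>|${HF_TOKEN}|g" \
    docs/examples/vllm/lws.yaml | kubectl apply -f -

# Once the pods are ready, forward the leader Service and run the curl
# request from the README against http://localhost:8080.
kubectl port-forward svc/vllm-leader 8080:8080
```

In a real cluster the Hugging Face token would more likely come from a Secret via `valueFrom.secretKeyRef` than from a literal value in the manifest.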