From 1bc1f81f21ffb7699d3e30530eb2806c78d873b4 Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Thu, 1 Aug 2024 22:39:40 +0000
Subject: [PATCH 1/4] updating vllm docs to use llama3 405B as example

---
 docs/examples/vllm/README.md        |  8 ++--
 docs/examples/vllm/build/Dockerfile |  2 +-
 docs/examples/vllm/lws.yaml         | 57 ++++++++++++++++++++---------
 3 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/docs/examples/vllm/README.md b/docs/examples/vllm/README.md
index 5a9df4f2..bf65902e 100644
--- a/docs/examples/vllm/README.md
+++ b/docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
 Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)
 
 ## Deploy LeaderWorkerSet of vLLM
-We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has two pods (tp=2).
+We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 machines per pod (tensor_parallel_size=8).
 The leader pod runs the Ray head and the http server, while the workers run the Ray workers.
 
 ```shell
@@ -67,7 +67,7 @@ Open another terminal and send a request
 curl http://localhost:8080/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
-    "model": "facebook/opt-125m",
+    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
     "prompt": "San Francisco is a",
     "max_tokens": 7,
     "temperature": 0
@@ -80,11 +80,11 @@ The output should be similar to the following
   "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
   "object": "text_completion",
   "created": 1715138766,
-  "model": "facebook/opt-125m",
+  "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
   "choices": [
     {
       "index": 0,
-      "text": " great place to live. I",
+      "text": " top destination for foodies, with",
       "logprobs": null,
       "finish_reason": "length",
       "stop_reason": null
diff --git a/docs/examples/vllm/build/Dockerfile b/docs/examples/vllm/build/Dockerfile
index 34d6ecff..b292a929 100644
--- a/docs/examples/vllm/build/Dockerfile
+++ b/docs/examples/vllm/build/Dockerfile
@@ -1,2 +1,2 @@
-FROM docker.io/vllm/vllm-openai:v0.4.1
+FROM docker.io/vllm/vllm-openai:v0.5.3.post1
 COPY ray_init.sh /vllm-workspace/ray_init.sh
diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 9328f802..c2c9ab0b 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -14,49 +14,60 @@ spec:
       spec:
         containers:
           - name: vllm-leader
-            # this image is build with the Dockerfile under ./build
-            image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
+            image:
             env:
               - name: RAY_CLUSTER_SIZE
                 valueFrom:
                   fieldRef:
                     fieldPath: metadata.annotations['leaderworkerset.sigs.k8s.io/size']
+              - name: HUGGING_FACE_HUB_TOKEN
+                value:
             command:
               - sh
               - -c
               - "/vllm-workspace/ray_init.sh leader --ray_cluster_size=$RAY_CLUSTER_SIZE;
-                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model facebook/opt-125m --swap-space 2 --tensor-parallel-size 2"
+                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
             resources:
               limits:
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: "125"
               requests:
-                cpu: "4"
-                memory: 8Gi
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: "125"
             ports:
               - containerPort: 8080
-            readinessProbe:
-              tcpSocket:
-                port: 8080
-              initialDelaySeconds: 15
-              periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
     workerTemplate:
       spec:
         containers:
          - name: vllm-worker
-            # this image is build with the Dockerfile under ./build
-            image: kube-ai-registry.cn-shanghai.cr.aliyuncs.com/kube-ai/vllm:0.4.1
+            image:
             command:
               - sh
               - -c
               - "/vllm-workspace/ray_init.sh worker --ray_address=$(LEADER_NAME).$(LWS_NAME).$(NAMESPACE).svc.cluster.local"
             resources:
               limits:
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: 125
               requests:
-                cpu: "4"
-                memory: 8Gi
-                nvidia.com/gpu: "1"
+                nvidia.com/gpu: "8"
+                memory: 1124Gi
+                ephemeral-storage: 2600Gi
+                cpu: 125
             env:
               - name: LEADER_NAME
                 valueFrom:
@@ -70,3 +81,13 @@ spec:
                 valueFrom:
                   fieldRef:
                     fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/name']
+              - name: HUGGING_FACE_HUB_TOKEN
+                value:
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
\ No newline at end of file

From 67282e73ff98837761c007e6de250b24d9197af1 Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Mon, 5 Aug 2024 21:47:51 +0000
Subject: [PATCH 2/4] addressed comments

---
 docs/examples/vllm/README.md | 2 +-
 docs/examples/vllm/lws.yaml  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/examples/vllm/README.md b/docs/examples/vllm/README.md
index bf65902e..89696e81 100644
--- a/docs/examples/vllm/README.md
+++ b/docs/examples/vllm/README.md
@@ -8,7 +8,7 @@ In this example, we will use LeaderWorkerSet to deploy a distributed inference s
 Follow the step-by-step guide on how to install LWS. [View installation guide](https://github.com/kubernetes-sigs/lws/blob/main/docs/setup/install.md)
 
 ## Deploy LeaderWorkerSet of vLLM
-We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 machines per pod (tensor_parallel_size=8).
+We use LeaderWorkerSet to deploy two vLLM model replicas, and each vLLM replica has 2 pods (pipeline_parallel_size=2) and 8 GPUs per pod (tensor_parallel_size=8).
 The leader pod runs the Ray head and the http server, while the workers run the Ray workers.
 
 ```shell
diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index c2c9ab0b..98e70597 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -90,4 +90,5 @@ spec:
           - name: dshm
             emptyDir:
               medium: Memory
-              sizeLimit: 15Gi
\ No newline at end of file
+              sizeLimit: 15Gi
+
\ No newline at end of file

From 4df5479e3066d2b8c6d74a06ab0a213407e044be Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Mon, 5 Aug 2024 23:42:27 +0000
Subject: [PATCH 3/4] fixed resource limit and request

---
 docs/examples/vllm/lws.yaml | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 98e70597..0691d897 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -31,13 +31,10 @@ spec:
               limits:
                 nvidia.com/gpu: "8"
                 memory: 1124Gi
-                ephemeral-storage: 2600Gi
-                cpu: "125"
+                ephemeral-storage: 800Gi
               requests:
-                nvidia.com/gpu: "8"
-                memory: 1124Gi
-                ephemeral-storage: 2600Gi
-                cpu: "125"
+                ephemeral-storage: 800Gi
+                cpu: 125
             ports:
               - containerPort: 8080
             volumeMounts:
@@ -61,12 +58,9 @@ spec:
               limits:
                 nvidia.com/gpu: "8"
                 memory: 1124Gi
-                ephemeral-storage: 2600Gi
-                cpu: 125
+                ephemeral-storage: 800Gi
               requests:
-                nvidia.com/gpu: "8"
-                memory: 1124Gi
-                ephemeral-storage: 2600Gi
+                ephemeral-storage: 800Gi
                 cpu: 125
             env:
               - name: LEADER_NAME
@@ -91,4 +85,3 @@ spec:
             emptyDir:
               medium: Memory
               sizeLimit: 15Gi
-
\ No newline at end of file

From a3b786452e092a320a1c9e72c341f433dba6b2c1 Mon Sep 17 00:00:00 2001
From: Edwinhr716
Date: Tue, 6 Aug 2024 00:16:51 +0000
Subject: [PATCH 4/4] added readinessProbe

---
 docs/examples/vllm/lws.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/examples/vllm/lws.yaml b/docs/examples/vllm/lws.yaml
index 0691d897..225773f9 100644
--- a/docs/examples/vllm/lws.yaml
+++ b/docs/examples/vllm/lws.yaml
@@ -37,6 +37,11 @@ spec:
                 cpu: 125
             ports:
               - containerPort: 8080
+            readinessProbe:
+              tcpSocket:
+                port: 8080
+              initialDelaySeconds: 15
+              periodSeconds: 10
             volumeMounts:
               - mountPath: /dev/shm
                 name: dshm
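
After applying these patches, a quick smoke test is sketched below. It is only an assumption-laden sketch: it assumes the LeaderWorkerSet in docs/examples/vllm/lws.yaml is named `vllm` (so its leader pod is `vllm-0`), that a Service such as `vllm-leader` exposes the leader on port 8080 as the README describes, and that the empty `image:` and `HUGGING_FACE_HUB_TOKEN` fields have been filled in; adjust the names to match your setup.

```shell
# Deploy the example (the image must be built from docs/examples/vllm/build/Dockerfile).
kubectl apply -f docs/examples/vllm/lws.yaml

# Each replica should come up as one leader pod plus one worker pod (pipeline_parallel_size=2),
# each requesting 8 GPUs (tensor_parallel_size=8).
kubectl get pods

# Wait for the leader to report that the OpenAI-compatible server is listening,
# then forward port 8080 and send the completion request from the README.
kubectl logs -f vllm-0 -c vllm-leader            # assumes the LeaderWorkerSet is named "vllm"
kubectl port-forward svc/vllm-leader 8080:8080   # assumes a Service named "vllm-leader" exists
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Meta-Llama-3.1-405B-Instruct", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0}'
```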