diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 7459c508f..fd2b2d800 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -1,4 +1,4 @@ -{{- if .Values.servingEngineSpec.enableEngine -}} +{{- if and .Values.servingEngineSpec.enableEngine (not (hasKey .Values.servingEngineSpec "raySpec")) -}} {{- range $modelSpec := .Values.servingEngineSpec.modelSpec }} {{- $kv_role := "kv_both" }} {{- $kv_rank := 0 }} diff --git a/helm/templates/ray-cluster.yaml b/helm/templates/ray-cluster.yaml new file mode 100644 index 000000000..3a1454c30 --- /dev/null +++ b/helm/templates/ray-cluster.yaml @@ -0,0 +1,620 @@ +{{- if and .Values.servingEngineSpec.enableEngine (hasKey .Values.servingEngineSpec "raySpec")}} +{{- range $modelSpec := .Values.servingEngineSpec.modelSpec }} +{{- with $ -}} +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-raycluster" + namespace: {{ .Release.Namespace }} + labels: + model: {{ $modelSpec.name }} + helm-release-name: {{ .Release.Name }} + {{- include "chart.engineLabels" . | nindent 4 }} +spec: + headGroupSpec: + serviceType: ClusterIP + rayStartParams: + dashboard-host: "0.0.0.0" + template: + metadata: + labels: + model: {{ $modelSpec.name }} + helm-release-name: {{ .Release.Name }} + {{- include "chart.engineLabels" . | nindent 10 }} + spec: + terminationGracePeriodSeconds: 0 + {{- if .Values.servingEngineSpec.securityContext }} + securityContext: + {{- toYaml .Values.servingEngineSpec.securityContext | nindent 10 }} + {{- end }} + containers: + - name: vllm-ray-head + image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}" + command: + - >- + /bin/bash -c " + cp /entrypoint/vllm-entrypoint.sh \$HOME/vllm-entrypoint.sh && + chmod +x \$HOME/vllm-entrypoint.sh && + \$HOME/vllm-entrypoint.sh & + echo \"Running vllm command in the background.\"" + env: + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: EXPECTED_NODES + value: "{{ add $modelSpec.replicaCount 1}}" + - name: HF_HOME + {{- if hasKey $modelSpec "pvcStorage" }} + value: /data + {{- else }} + value: /tmp + {{- end }} + {{- with $modelSpec.vllmConfig}} + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" + {{- if hasKey . "v1" }} + - name: VLLM_USE_V1 + value: {{ $modelSpec.vllmConfig.v1 | quote }} + {{- else }} + - name: VLLM_USE_V1 + value: "0" + {{- end}} + {{- end}} + {{- if $modelSpec.hf_token }} + - name: HF_TOKEN + {{- if kindIs "string" $modelSpec.hf_token }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: hf_token_{{ $modelSpec.name }} + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $modelSpec.hf_token.secretName }} + key: {{ $modelSpec.hf_token.secretKey }} + {{- end }} + {{- end }} + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} + {{- with $modelSpec.env }} + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- if $modelSpec.lmcacheConfig }} + {{- if $modelSpec.lmcacheConfig.enabled }} + - name: LMCACHE_USE_EXPERIMENTAL + value: "True" + - name: VLLM_RPC_TIMEOUT + value: "1000000" + {{- end }} + {{- if $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }} + - name: LMCACHE_LOCAL_CPU + value: "True" + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: "{{ $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }}" + {{- end }} + {{- if $modelSpec.lmcacheConfig.diskOffloadingBufferSize }} + - name: LMCACHE_LOCAL_DISK + value: "True" + - name: LMCACHE_MAX_LOCAL_DISK_SIZE + value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}" + {{- end }} + {{- if .Values.cacheserverSpec }} + - name: LMCACHE_REMOTE_URL + value: "{{ include "cacheserver.formatRemoteUrl" (dict "service_name" (print .Release.Name "-cache-server-service") "port" .Values.cacheserverSpec.servicePort) }}" + - name: LMCACHE_REMOTE_SERDE + value: "{{ .Values.cacheserverSpec.serde }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "enableController" }} + - name: LMCACHE_ENABLE_CONTROLLER + value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.enableController | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }} + - name: LMCACHE_INSTANCE_ID + value: {{ $modelSpec.lmcacheConfig.instanceId | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }} + - name: LMCACHE_CONTROLLER_URL + value: "{{ .Release.Name }}-{{$modelSpec.name}}-service:{{ $modelSpec.lmcacheConfig.controllerPort }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }} + - name: LMCACHE_WORKER_PORT + value: "{{ .Release.Name }}-service:{{ $modelSpec.lmcacheConfig.workerPort }}" + {{- end }} + {{- end }} + {{- if .Values.servingEngineSpec.configs }} + envFrom: + - configMapRef: + name: "{{ .Release.Name }}-configs" + {{- end }} + ports: + - name: {{ include "chart.container-port-name" . }} + containerPort: {{ include "chart.container-port" . }} + readinessProbe: + httpGet: + path: /health + port: {{ include "chart.container-port" . 
}} + failureThreshold: 1 + periodSeconds: 10 + livenessProbe: + exec: + command: ["/bin/bash", "-c", "echo TBD"] + resources: + limits: + cpu: {{ default "2" .Values.servingEngineSpec.raySpec.headNode.requestCPU }} + memory: {{ default "8Gi" .Values.servingEngineSpec.raySpec.headNode.requestMemory }} + {{- if hasKey .Values.servingEngineSpec.raySpec.headNode "requestGPU" }} + nvidia.com/gpu: {{ .Values.servingEngineSpec.raySpec.headNode.requestGPU }} + {{- end }} + startupProbe: + exec: + command: ["/bin/bash", "-c", "python3 /scripts/wait_for_ray.py"] + failureThreshold: 30 + periodSeconds: 15 + timeoutSeconds: 10 + volumeMounts: + - name: wait-script + mountPath: /scripts + - name: vllm-script + mountPath: /entrypoint + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumeMounts") }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + mountPath: /data + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + mountPath: /dev/shm + {{- end}} + {{- end}} + {{- if $modelSpec.chatTemplate }} + - name: vllm-templates + mountPath: /templates + {{- end }} + {{- if hasKey $modelSpec "extraVolumeMounts" }} + {{- toYaml $modelSpec.extraVolumeMounts | nindent 14 }} + {{- end }} + {{- end }} + volumes: + - name: wait-script + configMap: + name: wait-for-ray-script + - name: vllm-script + configMap: + name: vllm-start-script + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumes") }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + persistentVolumeClaim: + claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim" + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ default "20Gi" $modelSpec.shmSize }} + {{- end}} + {{- end}} + {{- if $modelSpec.chatTemplate}} + {{- if hasKey $modelSpec "chatTemplateConfigMap" }} + - name: {{ .Release.Name }}-chat-templates + configMap: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates" + {{- else }} + - name: vllm-templates + persistentVolumeClaim: + claimName: vllm-templates-pvc + {{- end }} + {{- end}} + {{- if hasKey $modelSpec "extraVolumes" }} + {{- toYaml $modelSpec.extraVolumes | nindent 8 }} + {{- end}} + {{- end}} + {{- if $modelSpec.imagePullSecret }} + imagePullSecrets: + - name: {{ $modelSpec.imagePullSecret }} + {{- end }} + {{- if .Values.servingEngineSpec.tolerations }} + {{- with .Values.servingEngineSpec.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.servingEngineSpec.runtimeClassName }} + runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }} + {{- end }} + {{- if .Values.servingEngineSpec.schedulerName }} + schedulerName: {{ .Values.servingEngineSpec.schedulerName }} + {{- end }} + {{- if $modelSpec.nodeName }} + nodeName: {{ $modelSpec.nodeName }} + {{- else if $modelSpec.nodeSelectorTerms}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + {{- with $modelSpec.nodeSelectorTerms }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- end }} + workerGroupSpecs: + - rayStartParams: {} + replicas: {{ $modelSpec.replicaCount }} + groupName: ray + template: + metadata: + labels: + model: {{ $modelSpec.name }} + helm-release-name: {{ .Release.Name }} + {{- if .Values.servingEngineSpec.securityContext }} + securityContext: + {{- toYaml .Values.servingEngineSpec.securityContext | nindent 8 }} + {{- end }} + spec: + containers: + - name: vllm-ray-worker + image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}" + env: + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: HF_HOME + {{- if hasKey $modelSpec "pvcStorage" }} + value: /data + {{- else }} + value: /tmp + {{- end }} + {{- with $modelSpec.vllmConfig}} + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" + {{- if hasKey . "v1" }} + - name: VLLM_USE_V1 + value: {{ $modelSpec.vllmConfig.v1 | quote }} + {{- else }} + - name: VLLM_USE_V1 + value: "0" + {{- end}} + {{- end}} + {{- if $modelSpec.hf_token }} + - name: HF_TOKEN + {{- if kindIs "string" $modelSpec.hf_token }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: hf_token_{{ $modelSpec.name }} + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $modelSpec.hf_token.secretName }} + key: {{ $modelSpec.hf_token.secretKey }} + {{- end }} + {{- end }} + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} + {{- with $modelSpec.env }} + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- if $modelSpec.lmcacheConfig }} + {{- if $modelSpec.lmcacheConfig.enabled }} + - name: LMCACHE_USE_EXPERIMENTAL + value: "True" + - name: VLLM_RPC_TIMEOUT + value: "1000000" + {{- end }} + {{- if $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }} + - name: LMCACHE_LOCAL_CPU + value: "True" + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: "{{ $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }}" + {{- end }} + {{- if $modelSpec.lmcacheConfig.diskOffloadingBufferSize }} + - name: LMCACHE_LOCAL_DISK + value: "True" + - name: LMCACHE_MAX_LOCAL_DISK_SIZE + value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}" + {{- end }} + {{- if .Values.cacheserverSpec }} + - name: LMCACHE_REMOTE_URL + value: "{{ include "cacheserver.formatRemoteUrl" (dict "service_name" (print .Release.Name "-cache-server-service") "port" .Values.cacheserverSpec.servicePort) }}" + - name: LMCACHE_REMOTE_SERDE + value: "{{ .Values.cacheserverSpec.serde }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "enableController" }} + - name: LMCACHE_ENABLE_CONTROLLER + value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.enableController | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }} + - name: LMCACHE_INSTANCE_ID + value: {{ $modelSpec.lmcacheConfig.instanceId | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }} + - name: LMCACHE_CONTROLLER_URL + value: "{{ .Release.Name }}-{{$modelSpec.name}}-service:{{ $modelSpec.lmcacheConfig.controllerPort }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }} + - name: LMCACHE_WORKER_PORT + value: "{{ .Release.Name }}-service:{{ $modelSpec.lmcacheConfig.workerPort }}" + {{- end }} + {{- end }} + {{- if .Values.servingEngineSpec.configs }} + envFrom: + - configMapRef: + name: "{{ .Release.Name }}-configs" + {{- end }} + readinessProbe: + exec: + command: ["/bin/bash", "-c", "echo TBD"] + livenessProbe: + exec: + command: ["/bin/bash", "-c", "echo TBD"] + resources: {{- include "chart.resources" $modelSpec | nindent 16 }} + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumeMounts") }} + volumeMounts: + {{- end }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + mountPath: /data + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + mountPath: /dev/shm + {{- end}} + {{- end}} + {{- if $modelSpec.chatTemplate }} + - name: vllm-templates + mountPath: /templates + {{- end }} + {{- if hasKey $modelSpec "extraVolumeMounts" }} + {{- toYaml $modelSpec.extraVolumeMounts | nindent 14 }} + {{- end }} + volumes: + - name: wait-script + configMap: + name: wait-for-ray-script + - name: vllm-script + configMap: + name: vllm-start-script + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumes") }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + persistentVolumeClaim: + claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim" + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ default "20Gi" $modelSpec.shmSize }} + {{- end}} + {{- end}} + {{- if 
$modelSpec.chatTemplate}} + {{- if hasKey $modelSpec "chatTemplateConfigMap" }} + - name: {{ .Release.Name }}-chat-templates + configMap: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates" + {{- else }} + - name: vllm-templates + persistentVolumeClaim: + claimName: vllm-templates-pvc + {{- end }} + {{- end}} + {{- if hasKey $modelSpec "extraVolumes" }} + {{- toYaml $modelSpec.extraVolumes | nindent 8 }} + {{- end}} + {{- end}} + {{- if $modelSpec.imagePullSecret }} + imagePullSecrets: + - name: {{ $modelSpec.imagePullSecret }} + {{- end }} + {{- if .Values.servingEngineSpec.tolerations }} + {{- with .Values.servingEngineSpec.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + + {{- if .Values.servingEngineSpec.runtimeClassName }} + runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }} + {{- end }} + {{- if .Values.servingEngineSpec.schedulerName }} + schedulerName: {{ .Values.servingEngineSpec.schedulerName }} + {{- end }} + {{- if $modelSpec.nodeName }} + nodeName: {{ $modelSpec.nodeName }} + {{- else if $modelSpec.nodeSelectorTerms}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + {{- with $modelSpec.nodeSelectorTerms }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- end }} + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: wait-for-ray-script +data: + wait_for_ray.py: | + import ray + import logging + import os + import sys + + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s') + + try: + ray.init(address="auto") + except Exception as e: + logging.error(f"Failed to initialize Ray: {e}") + sys.exit(1) + + expected_nodes = int(os.environ.get("EXPECTED_NODES", "1")) + + alive_nodes = [n for n in ray.nodes() if n["Alive"]] + alive_count = len(alive_nodes) + + logging.info(f"Ray cluster status: {alive_count}/{expected_nodes} nodes alive.") + + if alive_count == expected_nodes: + logging.info("Cluster is ready.") + sys.exit(0) + else: + logging.info("Cluster is NOT ready.") + sys.exit(1) +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-start-script +data: + vllm-entrypoint.sh: | + #!/bin/bash + set -e + + echo "Waiting for Ray to become available..." + until python3 /scripts/wait_for_ray.py; do + echo "Ray not ready yet. Retrying in 2 seconds..." + sleep 2 + done + + echo "Ray is ready. Starting vLLM..." + + # Start constructing command + ARGS=( + "vllm" + "serve" + "{{ $modelSpec.modelURL | quote }}" + "--host" "0.0.0.0" + "--port" "{{ include "chart.container-port" . }}" + "--distributed-executor-backend" "ray" + ) + + {{- if $modelSpec.enableLoRA }} + ARGS+=("--enable-lora") + {{- end }} + + {{- if $modelSpec.enableTool }} + ARGS+=("--enable-auto-tool-choice") + {{- end }} + + {{- if $modelSpec.toolCallParser }} + ARGS+=("--tool-call-parser" {{ $modelSpec.toolCallParser | quote }}) + {{- end }} + + {{- with $modelSpec.vllmConfig }} + {{- if hasKey . "enableChunkedPrefill" }} + {{- if .enableChunkedPrefill }} + ARGS+=("--enable-chunked-prefill") + {{- else }} + ARGS+=("--no-enable-chunked-prefill") + {{- end }} + {{- end }} + + {{- if .enablePrefixCaching }} + ARGS+=("--enable-prefix-caching") + {{- end }} + + {{- if hasKey . "maxModelLen" }} + ARGS+=("--max-model-len" {{ .maxModelLen | quote }}) + {{- end }} + + {{- if hasKey . "dtype" }} + ARGS+=("--dtype" {{ .dtype | quote }}) + {{- end }} + + {{- if hasKey . 
"tensorParallelSize" }} + ARGS+=("--tensor-parallel-size" {{ .tensorParallelSize | quote }}) + {{- end }} + + {{- if hasKey . "pipelineParallelSize" }} + ARGS+=("--pipeline-parallel-size" {{ .pipelineParallelSize | quote }}) + {{- end }} + + {{- if hasKey . "maxNumSeqs" }} + ARGS+=("--max-num-seqs" {{ .maxNumSeqs | quote }}) + {{- end }} + + {{- if hasKey . "gpuMemoryUtilization" }} + ARGS+=("--gpu-memory-utilization" {{ .gpuMemoryUtilization | quote }}) + {{- end }} + + {{- if hasKey . "maxLoras" }} + ARGS+=("--max-loras" {{ .maxLoras | quote }}) + {{- end }} + + {{- range .extraArgs }} + ARGS+=({{ . | quote }}) + {{- end }} + {{- end }} + + {{- if $modelSpec.lmcacheConfig }} + {{- if $modelSpec.lmcacheConfig.enabled }} + {{- if hasKey $modelSpec.vllmConfig "v1" }} + {{- if eq (toString $modelSpec.vllmConfig.v1) "1" }} + ARGS+=("--kv-transfer-config" "{\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}") + {{- else }} + ARGS+=("--kv-transfer-config" "{\"kv_connector\":\"LMCacheConnector\",\"kv_role\":\"kv_both\"}") + {{- end }} + {{- else }} + ARGS+=("--kv-transfer-config" "{\"kv_connector\":\"LMCacheConnector\",\"kv_role\":\"kv_both\"}") + {{- end }} + {{- end }} + {{- end }} + + {{- if $modelSpec.chatTemplate }} + ARGS+=("--chat-template" {{ $modelSpec.chatTemplate | quote }}) + {{- end }} + + echo "Executing: ${ARGS[@]}" + exec "${ARGS[@]}" + + +{{- if and $modelSpec.chatTemplate (hasKey $modelSpec "chatTemplateConfigMap") }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates" + namespace: "{{ .Release.Namespace }}" +data: + {{ $modelSpec.chatTemplate }}: |- + {{ $modelSpec.chatTemplateConfigMap }} +{{- end }} +{{- end }} +--- +{{- end }} +{{- end }} diff --git a/tutorials/00-a-install-multinode-kubernetes-env.md b/tutorials/00-a-install-multinode-kubernetes-env.md new file mode 100644 index 000000000..2f7dae565 --- /dev/null +++ b/tutorials/00-a-install-multinode-kubernetes-env.md @@ -0,0 +1,411 @@ +# Tutorial: Setting Up a Kubernetes Environment with GPUs on Your GPU Server + +## Introduction + +This tutorial provides a comprehensive guide to setting up a Kubernetes environment across multiple GPU-enabled servers. It covers the installation and configuration of `kubeadm`, `kubectl`, and `helm`, with a focus on ensuring GPU compatibility for workloads that require accelerated computing. By the end of this tutorial, you will have a fully operational multi-node Kubernetes cluster prepared for deploying the vLLM Production Stack. + +## Table of Contents + +- [Introduction](#introduction) +- [Table of Contents](#table-of-contents) +- [Prerequisites](#prerequisites) +- [Steps](#steps) + - [Step 1: Installing kubeadm on each node](#step-1-installing-kubeadm-on-each-node) + - [Step 2: Installing container runtime on each node](#step-2-installing-container-runtime-on-each-node) + - [Step 3: Setting up a control plane node](#step-3-setting-up-a-control-plane-node) + - [Step 4: Setting and joining a worker node](#step-4-setting-and-joining-a-worker-node) + - [Step 5: Installing container network interface](#step-5-installing-container-network-interface) + - [Step 6: Installing nvidia device plugin](#step-6-installing-nvidia-device-plugin) + +## Prerequisites + +Before you begin, ensure the following: + +1. **GPU Server Requirements:** + - A server with a GPU and drivers properly installed (e.g., NVIDIA drivers). 
+   - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed for GPU workloads.
+
+2. **Access and Permissions:**
+   - Root or administrative access to the server.
+   - Internet connectivity to download required packages and tools.
+
+3. **Environment Setup:**
+   - A Linux-based operating system (e.g., Ubuntu 20.04 or later).
+   - Basic understanding of Linux shell commands.
+
+4. **Tested Environment:**
+   - This guide was tested on a Debian 11 (Bullseye) operating system with 24 CPUs, 100 GiB of RAM, and 300 GiB of disk space. Note that certain configurations or settings may vary or not function as expected on other systems, depending on your specific environment.
+
+## Steps
+
+### Step 1: Installing kubeadm on each node
+
+1. Access to a bare-metal server that will serve as the control plane node.
+
+2. Clone the repository and navigate to the [`utils/`](../utils/) folder:
+
+   ```bash
+   git clone https://github.com/vllm-project/production-stack.git
+   cd production-stack/utils
+   ```
+
+3. Execute the script [`install-kubeadm.sh`](../utils/install-kubeadm.sh):
+
+   ```bash
+   bash install-kubeadm.sh
+   ```
+
+4. **Expected Output:**
+   - Confirmation that `kubeadm` was downloaded and installed.
+   - Verification message using:
+
+     ```bash
+     kubeadm version
+     ```
+
+     Example output:
+
+     ```plaintext
+     kubeadm version: &version.Info{Major:"1", Minor:"32", GitVersion:"v1.32.4", GitCommit:"59526cd4867447956156ae3a602fcbac10a2c335", GitTreeState:"clean", BuildDate:"2025-04-22T16:02:27Z", GoVersion:"go1.23.6", Compiler:"gc", Platform:"linux/amd64"}
+     ```
+
+5. **Explanation:**
+   This script downloads version 1.32 of [`kubeadm`](https://v1-32.docs.kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/), the Kubernetes command-line tool for cluster bootstrapping and management, along with `kubectl` and `kubelet`, on the current node.
+
+6. Repeat steps 1 to 3 on your other bare-metal server, which will serve as a worker node.
+
+### Step 2: Installing container runtime on each node
+
+1. Access to a bare-metal server that will serve as the control plane node.
+
+2. Execute the script [`install-cri-o.sh`](../utils/install-cri-o.sh):
+
+   ```bash
+   bash install-cri-o.sh
+   ```
+
+3. **Expected Output:**
+   - Successful installation of the CRI-O runtime.
+   - Verification message using:
+
+     ```bash
+     sudo systemctl status crio
+     ```
+
+     Example output:
+
+     ```plaintext
+     ● crio.service - Container Runtime Interface for OCI (CRI-O)
+          Loaded: loaded (/lib/systemd/system/crio.service; enabled; vendor preset: enabled)
+          Active: active (running) since Fri 2025-05-16 16:32:31 UTC; 20h ago
+            Docs: https://github.com/cri-o/cri-o
+        Main PID: 2332175 (crio)
+           Tasks: 61
+          Memory: 14.4G
+             CPU: 17min 55.486s
+          CGroup: /system.slice/crio.service
+     ```
+
+4. **Explanation:**
+   This script downloads, installs, and configures v1.32 of [`CRI-O`](https://github.com/cri-o/packaging/blob/main/README.md#distributions-using-deb-packages), one of the container runtimes Kubernetes can use to manage pods on your cluster.
+
+5. Repeat steps 1 to 2 on your other bare-metal server, which will serve as a worker node.
+
+### Step 3: Setting up a control plane node
+
+1. Access to a bare-metal server that will serve as the control plane node.
+
+2. 
Execute the following command and wait for it to complete: + + ```bash + # Look for a line starting with "default via" + # For example: default via 10.128.0.1 dev ens5 + ip route show + + # Or get your network interface's ip address using the following command: + export K8S_NET_IP=$(ip addr show dev $(ip route show | awk '/^default/ {print $5}') | awk '/inet / {print $2}' | cut -d/ -f1) + echo "K8S_NET_IP=${K8S_NET_IP}" + + # On one of the nodes designated to become a control plane node, execute the following command: + sudo kubeadm init \ + --cri-socket=unix:///var/run/crio/crio.sock \ + --apiserver-advertise-address=${K8S_NET_IP} \ + --pod-network-cidr=192.168.0.0/16 + ``` + + Example output: + + ```plaintext + # Your Kubernetes control-plane has initialized successfully! + + # To start using your cluster, you need to run the following as a regular user: + + # mkdir -p $HOME/.kube + # sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config + # sudo chown $(id -u):$(id -g) $HOME/.kube/config + + # Alternatively, if you are the root user, you can run: + + # export KUBECONFIG=/etc/kubernetes/admin.conf + + # You should now deploy a pod network to the cluster. + # Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at: + # https://kubernetes.io/docs/concepts/cluster-administration/addons/ + + # Then you can join any number of worker nodes by running the following on each as root: + + # kubeadm join --token \ + # --discovery-token-ca-cert-hash + ``` + + Perform following command to set your kube config: + + ```bash + mkdir -p $HOME/.kube + sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config + sudo chown $(id -u):$(id -g) $HOME/.kube/config + ``` + + If your control plane node is equipped with GPUs and you want GPU-enabled pods to be scheduled on it, you must remove the default taint from the node: + + ```bash + kubectl taint node instance-20250503-060921 node-role.kubernetes.io/control-plane- + ``` + +3. **Expected Output:** + - Successful initialization of control plane node. + - Verification message using: + + ```bash + kubectl get nodes -o wide + ``` + + Example output: + + ```plaintext + NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME + instance-20250503-060921 Ready control-plane 20h v1.32.4 10.xxx.x.xx Debian GNU/Linux 11 (bullseye) 5.10.0-33-cloud-amd64 cri-o://1.32.4 + ``` + + Refer to [`official kubeadm documentation`](https://v1-32.docs.kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm/) for more information. + +### Step 4: Setting and joining a worker node + +1. Access to a bare-metal server that will serve as the worker node. + +2. Execute the following command and wait for it to complete: + + ```bash + # You got following output from previous control node initialization: + + # -------------------------------------------------------------------------------- + # Your Kubernetes control-plane has initialized successfully! + # + # ... 
+ # + # Then you can join any number of worker nodes by running the following on each as root: + # + # kubeadm join --token \ + # --discovery-token-ca-cert-hash sha256: + # -------------------------------------------------------------------------------- + + # Execute the following command on your worker node: + sudo kubeadm join :6443 --token \ + --discovery-token-ca-cert-hash sha256: \ + --cri-socket=unix:///var/run/crio/crio.sock + ``` + + If you lost above information, you can get the token and hash by running following command on your CONTROL PLANE node:: + + ```bash + # To get : + export K8S_NET_IP=$(ip addr show dev $(ip route show | awk '/^default/ {print $5}') | awk '/inet / {print $2}' | cut -d/ -f1) + echo "K8S_NET_IP=${K8S_NET_IP}" + + # To get : + sudo kubeadm token create + + # To get : + openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | \ + openssl rsa -pubin -outform der 2>/dev/null | \ + sha256sum | awk '{print $1}' + ``` + + Example output: + + ```plaintext + sudo kubeadm join :6443 --token --discovery-token-ca-cert-hash sha256: --cri-socket=unix:///var/run/crio/crio.sock + [preflight] Running pre-flight checks + [preflight] Reading configuration from the "kubeadm-config" ConfigMap in namespace "kube-system"... + [preflight] Use 'kubeadm init phase upload-config --config your-config.yaml' to re-upload it. + [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml" + [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env" + [kubelet-start] Starting the kubelet + [kubelet-check] Waiting for a healthy kubelet at http://127.0.0.1:10248/healthz. This can take up to 4m0s + [kubelet-check] The kubelet is healthy after 500.795239ms + [kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap + + This node has joined the cluster: + * Certificate signing request was sent to apiserver and a response was received. + * The Kubelet was informed of the new secure connection details. + + Run 'kubectl get nodes' on the control-plane to see this node join the cluster. + ``` + + Copy kube config file from your control plane node to current worker node (with ssh or scp): + + ```bash + mkdir -p $HOME/.kube + scp YOUR_SSH_ACCOUNT:$HOME/.kube/config $HOME/.kube/config + sudo chown $(id -u):$(id -g) $HOME/.kube/config + ``` + +3. **Expected Output:** + - Successful initialization of worker node. + - Verification message using: + + ```bash + kubectl get nodes -o wide + ``` + + Example output: + + ```plaintext + NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME + instance-20250503-060921 Ready control-plane 20h v1.32.4 10.xxx.x.xxx Debian GNU/Linux 11 (bullseye) 5.10.0-33-cloud-amd64 cri-o://1.32.4 + insudevmachine Ready 14m v1.32.4 10.yyy.y.yyy Debian GNU/Linux 11 (bullseye) 5.10.0-33-cloud-amd64 cri-o://1.32.4 + ``` + + Refer to [`official kubeadm documentation`](https://kubernetes.io/docs/reference/setup-tools/kubeadm/kubeadm-join/) for more information. + +### Step 5: Installing container network interface + +1. Access to a bare-metal server that will serve as the control plane node. + +2. Clone the repository and navigate to the [`utils/`](../utils/) folder: + + ```bash + git clone https://github.com/vllm-project/production-stack.git + cd production-stack/utils + ``` + +3. Execute the script [`install-calico.sh`](../utils/install-calico.sh): + + ```bash + bash install-calico.sh + ``` + +4. 
**Expected Output:** + - Confirmation that the `Tigera` operator and its associated custom resources have been successfully installed. + - Verification message using: + + ```bash + kubectl get pods -o wide + ``` + + Example output: + + ```plaintext + NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS + GATES + calico-apiserver calico-apiserver-cccf4bb9f-8lbc7 1/1 Running 0 21h 192.168.190.7 instance-20250503-060921 + calico-apiserver calico-apiserver-cccf4bb9f-knn9c 1/1 Running 0 21h 192.168.190.4 instance-20250503-060921 + calico-system calico-kube-controllers-56dfdbb787-c24gd 1/1 Running 0 21h 192.168.190.2 instance-20250503-060921 + calico-system calico-node-dtbcq 1/1 Running 0 21h 10.xxx.xxx.xxx instance-20250503-060921 + calico-system calico-node-jptsp 1/1 Running 0 33m 10.xxx.xxx.xxx insudevmachine + calico-system calico-typha-b7d75bc58-h6vrb 1/1 Running 0 37m 10.xxx.xxx.xxx instance-20250503-060921 + calico-system csi-node-driver-884sn 2/2 Running 0 26m 192.168.165.193 insudevmachine + calico-system csi-node-driver-bb7dl 2/2 Running 0 21h 192.168.190.1 instance-20250503-060921 + calico-system goldmane-7b5b4cd5d9-6bk5p 1/1 Running 0 21h 192.168.190.6 instance-20250503-060921 + calico-system whisker-5dbf545674-hnkpz 2/2 Running 0 21h 192.168.190.8 instance-20250503-060921 + ... + kube-system coredns-668d6bf9bc-5hvx7 1/1 Running 0 21h 192.168.190.3 instance-20250503-060921 + kube-system coredns-668d6bf9bc-wb7qq 1/1 Running 0 21h 192.168.190.5 instance-20250503-060921 + ``` + + Ensure that the status of each node is marked as “Ready” and that the CoreDNS pods are running: + + ```bash + kubectl get nodes + + # NAME STATUS ROLES AGE VERSION + # instance-20250503-060921 Ready control-plane 21h v1.32.4 + # insudevmachine Ready 37m v1.32.4 + + kubectl get pods -n kube-system | grep -i coredns + + # coredns-668d6bf9bc-5hvx7 1/1 Running 0 21h + # coredns-668d6bf9bc-wb7qq 1/1 Running 0 21h + ``` + +5. **Explanation:** + This script downloads version 3.30.0 of [`calico`](https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart), a container network interface (CNI) plugin for Kubernetes clusters. + +### Step 6: Installing nvidia device plugin + +1. Access to a bare-metal server that will serve as the control plane node. + +2. Clone the repository and navigate to the [`utils/`](../utils/) folder: + + ```bash + git clone https://github.com/vllm-project/production-stack.git + cd production-stack/utils + ``` + +3. Execute the script [`init-nvidia-gpu-setup-k8s.sh`](../utils/init-nvidia-gpu-setup-k8s.sh): + + ```bash + bash init-nvidia-gpu-setup-k8s.sh + ``` + +4. **Explanation:** + - Configures the system to support GPU workloads by enabling the NVIDIA Container Toolkit and starting Minikube with GPU support. + - Installs the NVIDIA `gpu-operator` chart to manage GPU resources within the cluster. + +5. **Expected Output:** + If everything goes smoothly, you should see the example output like following: + + ```plaintext + ... + NAME: gpu-operator-1737507918 + LAST DEPLOYED: Wed Jan 22 01:05:21 2025 + NAMESPACE: gpu-operator + STATUS: deployed + REVISION: 1 + TEST SUITE: None + ``` + +6. Some troubleshooting tips for installing gpu-operator: + + If gpu-operator fails to start because of the common seen “too many open files” issue for minikube (and [kind](https://kind.sigs.k8s.io/)), then a quick fix below may be helpful. + + The issue can be observed by one or more gpu-operator pods in `CrashLoopBackOff` status, and be confirmed by checking their logs. 
For example, + + ```console + $ kubectl -n gpu-operator logs daemonset/nvidia-device-plugin-daemonset -c nvidia-device-plugin + IS_HOST_DRIVER=true + NVIDIA_DRIVER_ROOT=/ + DRIVER_ROOT_CTR_PATH=/host + NVIDIA_DEV_ROOT=/ + DEV_ROOT_CTR_PATH=/host + Starting nvidia-device-plugin + I0131 19:35:42.895845 1 main.go:235] "Starting NVIDIA Device Plugin" version=< + d475b2cf + commit: d475b2cfcf12b983a4975d4fc59d91af432cf28e + > + I0131 19:35:42.895917 1 main.go:238] Starting FS watcher for /var/lib/kubelet/device-plugins + E0131 19:35:42.895933 1 main.go:173] failed to create FS watcher for /var/lib/kubelet/device-plugins/: too many open files + ``` + + The fix is [well documented](https://kind.sigs.k8s.io/docs/user/known-issues#pod-errors-due-to-too-many-open-files) by kind, it also works for minikube. + +## Conclusion + +By completing this tutorial, you have successfully established a multi-node Kubernetes environment with GPU support on your servers. You are now prepared to deploy and test the vLLM Production Stack within this Kubernetes cluster. For additional configuration and workload-specific guidance, please refer to the official documentation for `kubectl`, `helm`, and `minikube`. + +What's next: + +- [00-b-install-kuberay-operator](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-b-install-kuberay-operator.md) diff --git a/tutorials/00-b-install-kuberay-operator.md b/tutorials/00-b-install-kuberay-operator.md new file mode 100644 index 000000000..470521e1c --- /dev/null +++ b/tutorials/00-b-install-kuberay-operator.md @@ -0,0 +1,85 @@ +# Tutorial: Setting Up a Kuberay Operator on Your Kubernetes Environment + +## Introduction + +This tutorial provides a step-by-step guide to installing and configuring the KubeRay operator within a Kubernetes environment. We will use the helm chart to set up kuberay, enabling distributed inference with vLLM. By the end of this tutorial, you will have a fully operational KubeRay operator ready to support the deployment of the vLLM Production Stack. + +## Table of Contents + +- [Introduction](#introduction) +- [Table of Contents](#table-of-contents) +- [Prerequisites](#prerequisites) +- [Steps](#steps) + - [Step 1: Install the KubeRay Operator Using Helm](#step-1-install-the-kuberay-operator-using-helm) + - [Step 2: Verify the KubeRay Configuration](#step-2-verify-the-kuberay-configuration) + +## Prerequisites + +Before you begin, ensure the following: + +1. **GPU Server Requirements:** + - A server with a GPU and drivers properly installed (e.g., NVIDIA drivers). + - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed for GPU workloads. + +2. **Access and Permissions:** + - Root or administrative access to the server. + - Internet connectivity to download required packages and tools. + +3. **Environment Setup:** + - A Linux-based operating system (e.g., Ubuntu 20.04 or later). + - Basic understanding of Linux shell commands. + +4. **Kubernetes Installation:** + - To quickly and easily set up a single-node Kubernetes environment, you may install Minikube by following the instructions provided in[`00-install-kubernetes-env.md`](00-install-kubernetes-env.md). + - For setting up a multi-node cluster or a more generalized Kubernetes environment, you may install Kubernetes from scratch using Kubeadm. 
This involves configuring the container runtime and container network interface (CNI), as outlined in [`00-a-install-multinode-kubernetes-env.md`](00-a-install-multinode-kubernetes-env.md) + - If you already have a running Kubernetes cluster, you may skip this step. + +5. **Kuberay Concept Review:** + - Review the [`official KubeRay documentation`](https://docs.ray.io/en/latest/cluster/kubernetes/index.html) for additional context and best practices. + +## Steps + +### Step 1: Install the KubeRay Operator Using Helm + +1. Add the KubeRay Helm repository: + + ```bash + helm repo add kuberay https://ray-project.github.io/kuberay-helm/ + helm repo update + ``` + +2. Install the Custom Resource Definitions (CRDs) and the KubeRay operator (version 1.2.0) in the default namespace: + + ```bash + helm install kuberay-operator kuberay/kuberay-operator --version 1.2.0 + ``` + +3. **Explanation:** + This step deploys the stable KubeRay operator in your Kubernetes cluster. The operator is essential for managing Ray clusters and enables you to scale multiple vLLM instances for distributed inference workloads. + +### Step 2: Verify the KubeRay Configuration + +1. **Check the Operator Pod Status:** + - Ensure that the KubeRay operator pod is running in the default namespace: + + ```bash + kubectl get pods + ``` + +2. **Expected Output:** + Example output: + + ```plaintext + NAME READY STATUS RESTARTS AGE + kuberay-operator-975995b7d-75jqd 1/1 Running 0 25h + ``` + +## Conclusion + +You have now successfully installed and verified the KubeRay operator in your Kubernetes environment. This setup lays the foundation for deploying and managing the vLLM Production Stack for distributed inference or training workloads. + +For advanced configurations and workload-specific tuning, refer to the official documentation for kuberay, kubectl, helm, and minikube. + +What's next: + +- [15-basic-pipeline-parallel](https://github.com/vllm-project/production-stack/blob/main/tutorials/15-basic-pipeline-parallel.md) diff --git a/tutorials/15-basic-pipeline-parallel.md b/tutorials/15-basic-pipeline-parallel.md new file mode 100644 index 000000000..2d5b6d0b0 --- /dev/null +++ b/tutorials/15-basic-pipeline-parallel.md @@ -0,0 +1,309 @@ +# Tutorial: Basic vLLM Configurations + +## Introduction + +This tutorial provides a step-by-step guide for configuring and deploying the vLLM serving engine on a multi-node Kubernetes cluster with support for distributed inference using KubeRay. It also explains how to launch the vLLM serving engine with pipeline parallelism enabled. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Step 1: Basic explanation of Ray and Kuberay](#step-1-basic-explanation-of-ray-and-kuberay) +3. [Step 2: Preparing the Configuration File](#step-2-preparing-the-configuration-file) +4. [Step 3: Applying the Configuration](#step-3-applying-the-configuration) +5. [Step 4: Verifying the Ray Cluster](#step-4-verifying-the-deployment) + +## Prerequisites + +- A Kubernetes cluster with multiple nodes with GPU support, as set up in the [00-a-install-multinode-kubernetes-env tutorial](00-a-install-multinode-kubernetes-env.md). +- Install kuberay operator on the Kubernetes environment with [00-b-install-kuberay-operator tutorial](00-b-install-kuberay-operator.md). +- Helm installed on your system. +- Access to a HuggingFace token (`HF_TOKEN`). +- A basic understanding of Ray is recommended. 
For more information, refer to the [official ray documentation](https://docs.ray.io/en/latest/cluster/kubernetes/index.html). + +## Step 1: Basic explanation of Ray and Kuberay + +1. Ray is a framework designed for distributed workloads, such as distributed training and inference. It operates by running multiple processes—typically containers or pods—to distribute and synchronize tasks efficiently. + +2. Ray organizes these processes into a Ray cluster, which consists of a single head node and multiple worker nodes. The term "node" here refers to a logical process, which can be deployed as a container or pod. + +3. KubeRay is a Kubernetes operator that simplifies the creation and management of Ray clusters within a Kubernetes environment. Without KubeRay, setting up Ray nodes requires manual configuration. + +4. Using KubeRay, you can easily deploy Ray clusters on Kubernetes. These clusters enable distributed inference with vLLM, supporting both tensor parallelism and pipeline parallelism. + +## Step 2: Preparing the Configuration File + +1. Locate the example configuration file [`tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml`](assets/values-15-minimal-pipeline-parallel-example.yaml). + +2. Open the file and update the following fields: + +- Write your actual huggingface token in `hf_token: ` in the yaml file. + +### Explanation of Key Items in `values-15-minimal-pipeline-parallel-example.yaml` + +- **`raySpec`**: Required when using KubeRay to enable pipeline parallelism. +- **`headNode`**: Specifies the resource requirements for the Kuberay head node and must be defined accordingly: + - **`requestCPU`**: The amount of CPU resources requested for Kuberay head pod. + - **`requestMemory`**: Memory allocation for Kuberay head pod. Sufficient memory is required to load the model. + - **`requestGPU`**: Defines the number of GPUs to allocate for the KubeRay head pod. Currently, the Ray head node must also participate in both tensor parallelism and pipeline parallelism. This requirement exists because the `vllm serve ...` command is executed on the Ray head node, and vLLM mandates that the pod where this command is run must have at least one visible GPU. +- **`name`**: The unique identifier for your model deployment. +- **`repository`**: The Docker repository containing the model's serving engine image. +- **`tag`**: Specifies the version of the model image to use. +- **`modelURL`**: The URL pointing to the model on Hugging Face or another hosting service. +- **`replicaCount`**: The number of total Kuberay worker pods. +- **`requestCPU`**: The amount of CPU resources requested per Kuberay worker pod. +- **`requestMemory`**: Memory allocation for each Kuberay worker pod. Sufficient memory is required to load the model. +- **`requestGPU`**: Specifies the number of GPUs to allocate for each Kuberay worker pod. +- **`vllmConfig`**: Contains model-specific configurations: + - `tensorParallelSize`: Specifies the number of GPUs assigned to each worker pod. This value must be identical to both `requestGPU` and `raySpec.headNode.requestGPU`. + - `pipelineParallelSize`: Indicates the level of pipeline parallelism. This value must be equal to `replicaCount + 1`, representing the total number of Ray cluster nodes, including both head and worker nodes. + - **Important Note:** + - The total number of GPUs required is computed as `pipelineParallelSize × tensorParallelSize`. 
+ - This total must exactly match the sum of: + - `replicaCount × requestGPU` (the total number of GPUs allocated to Ray worker nodes), and + - `raySpec.headNode.requestGPU` (the number of GPUs allocated to the Ray head node). + - The `requestGPU` value for the Ray head node must be identical to that of each worker node. + - `tensorParallelSize` defines the number of GPUs allocated per Ray node (including both head and worker nodes), and must be consistent across all nodes. + - `pipelineParallelSize` represents the total number of Ray nodes, and must therefore be set to replicaCount + 1 (i.e., the number of worker nodes plus the head node). +- **`shmSize`**: Configures the shared memory size to ensure adequate memory is available for inter-process communication during tensor and pipeline parallelism execution. +- **`hf_token`**: The Hugging Face token for authenticating with the Hugging Face model hub. + +### Example Snippet + +In the following example, we configure a total of two Ray nodes each equipped with two GPUs (one head node and one worker node) to serve a distilgpt2 model. We set the tensor parallelism size to 2, as each node contains two GPUs, and the pipeline parallelism size to 2, corresponding to the two Ray nodes being utilized. + +```yaml +servingEngineSpec: + runtimeClassName: "" + raySpec: + headNode: + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + modelSpec: + - name: "distilgpt2" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "distilbert/distilgpt2" + + replicaCount: 1 + + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + + vllmConfig: + tensorParallelSize: 2 + pipelineParallelSize: 2 + + shmSize: "20Gi" + + hf_token: +``` + +## Step 3: Applying the Configuration + +Deploy the configuration using Helm: + +```bash +helm repo add vllm https://vllm-project.github.io/production-stack +helm install vllm vllm/vllm-stack -f tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml +``` + +Expected output: + +You should see output indicating the successful deployment of the Helm chart: + +```plaintext +NAME: vllm +LAST DEPLOYED: Sun May 11 15:10:34 2025 +NAMESPACE: default +STATUS: deployed +REVISION: 1 +TEST SUITE: None +``` + +## Step 4: Verifying the Deployment + +1. Check the status of the pods: + + ```bash + kubectl wait --for=condition=ready pod -l environment=router,release=router --namespace=default --timeout=60s && \ + kubectl get pods + ``` + + Expected output: + + You should see the following pods: + + ```plaintext + pod/vllm-deployment-router-8666bf6464-v97v8 condition met + NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES + kuberay-operator-f89ddb644-858bw 1/1 Running 0 12h 192.168.165.203 insudevmachine + vllm-deployment-router-8666bf6464-v97v8 1/1 Running 0 12h 192.168.165.206 insudevmachine + vllm-distilgpt2-raycluster-head-wvqj5 1/1 Running 0 12h 192.168.190.20 instance-20250503-060921 + vllm-distilgpt2-raycluster-ray-worker-fdvnh 1/1 Running 0 12h 192.168.165.207 insudevmachine + ``` + + - In this example, the production stack is deployed in a Kubernetes environment consisting of two nodes, each equipped with two GPUs. + + - The Ray head and worker nodes are scheduled on separate nodes. A total of four GPUs are utilized, with each node contributing two GPUs. + + - The vllm-deployment-router pod functions as the request router, directing incoming traffic to the appropriate model-serving pod. + + - The vllm-distilgpt2-raycluster-head pod is responsible for running the primary vLLM command. 
+ + - The vllm-distilgpt2-raycluster-ray-worker-* pods serve the model and handle inference requests. + +2. Verify the service is exposed correctly: + + ```bash + kubectl get services + ``` + + Expected output: + + Ensure there are services for both the serving engine and the router: + + ```plaintext + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + kuberay-operator ClusterIP 10.97.0.153 8080/TCP 13h + kubernetes ClusterIP 10.96.0.1 443/TCP 13h + vllm-distilgpt2-engine-service ClusterIP 10.106.237.111 80/TCP 12h + vllm-distilgpt2-raycluster-head-svc ClusterIP None 8000/TCP,8080/TCP 12h + vllm-router-service ClusterIP 10.97.229.184 80/TCP 12h + ``` + + - The `vllm-*-engine-service` exposes the head node of the ray cluster. + - The `vllm-*-router-service` handles routing and load balancing across model-serving pods. + +3. Test the health endpoint: + + To verify that the service is operational, execute the following commands: + + ```bash + kubectl port-forward svc/vllm-router-service 30080:80 + curl http://localhost:30080/v1/models + ``` + + **Note:** Port forwarding must be performed from a separate shell session. If the deployment is configured correctly, you should receive a response similar to the following: + + ```plaintext + { + "object": "list", + "data": [ + { + "id": "distilbert/distilgpt2", + "object": "model", + "created": 1747465656, + "owned_by": "vllm", + "root": null + } + ] + } + ``` + + You may also perform a basic inference test to validate that pipeline parallelism is functioning as expected. Use the following curl command: + + ```bash + curl -X POST http://localhost:30080/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "distilbert/distilgpt2", + "prompt": "Once upon a time,", + "max_tokens": 10 + }' + ``` + + A successful response should resemble the following output: + + ```plaintext + { + "id": "cmpl-92c4ceef0f1c42c9bba10da8306bf86c", + "object": "text_completion", + "created": 1747465724, + "model": "distilbert/distilgpt2", + "choices": [ + { + "index": 0, + "text": "? Huh, are you all red?\n\n", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "usage": { + "prompt_tokens": 5, + "total_tokens": 15, + "completion_tokens": 10, + "prompt_tokens_details": null + } + } + ``` + + You can also monitor GPU usage for each Ray head and worker pod: + + ```plaintext + kubectl exec -it vllm-distilgpt2-raycluster-head-wvqj5 -- /bin/bash + root@vllm-distilgpt2-raycluster-head-wvqj5:/vllm-workspace# nvidia-smi + Sat May 17 00:10:48 2025 + +-----------------------------------------------------------------------------------------+ + | NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | + |-----------------------------------------+------------------------+----------------------+ + | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | + | | | MIG M. 
| + |=========================================+========================+======================| + | 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 | + | N/A 76C P0 35W / 72W | 20313MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + | 1 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 | + | N/A 70C P0 33W / 72W | 20305MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + + +-----------------------------------------------------------------------------------------+ + | Processes: | + | GPU GI CI PID Type Process name GPU Memory | + | ID ID Usage | + |=========================================================================================| + | 0 N/A N/A 8 C /usr/bin/python3 0MiB | + | 1 N/A N/A 1082 C ray::RayWorkerWrapper 0MiB | + +-----------------------------------------------------------------------------------------+ + + ########################################################################################### + + kubectl exec -it vllm-distilgpt2-raycluster-ray-worker-fdvnh -- /bin/bash + Defaulted container "vllm-ray-worker" out of: vllm-ray-worker, wait-gcs-ready (init) + root@vllm-distilgpt2-raycluster-ray-worker-fdvnh:/vllm-workspace# nvidia-smi + Sat May 17 00:12:06 2025 + +-----------------------------------------------------------------------------------------+ + | NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | + |-----------------------------------------+------------------------+----------------------+ + | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | + | | | MIG M. | + |=========================================+========================+======================| + | 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 | + | N/A 76C P0 40W / 72W | 20065MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + | 1 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 | + | N/A 72C P0 38W / 72W | 20063MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + + +-----------------------------------------------------------------------------------------+ + | Processes: | + | GPU GI CI PID Type Process name GPU Memory | + | ID ID Usage | + |=========================================================================================| + | 0 N/A N/A 243 C ray::RayWorkerWrapper 0MiB | + | 1 N/A N/A 244 C ray::RayWorkerWrapper 0MiB | + +-----------------------------------------------------------------------------------------+ + ``` + +## Conclusion + +In this tutorial, you configured and deployed the vLLM serving engine with support for pipeline parallelism across multiple GPUs within a multi-node Kubernetes environment using KubeRay. Additionally, you learned how to verify the deployment and monitor the associated pods to ensure proper operation. For further customization and configuration options, please consult the `values.yaml` file and the Helm chart documentation. 
diff --git a/tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml b/tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml new file mode 100644 index 000000000..62dbf8d54 --- /dev/null +++ b/tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml @@ -0,0 +1,24 @@ +servingEngineSpec: + runtimeClassName: "" + raySpec: + headNode: + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + modelSpec: + - name: "distilgpt2" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "distilbert/distilgpt2" + + replicaCount: 1 + + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + + vllmConfig: + tensorParallelSize: 2 + pipelineParallelSize: 2 + + shmSize: "20Gi" diff --git a/utils/init-nvidia-gpu-setup-k8s.sh b/utils/init-nvidia-gpu-setup-k8s.sh new file mode 100755 index 000000000..c49d4144e --- /dev/null +++ b/utils/init-nvidia-gpu-setup-k8s.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -e + +# Allow users to override the paths for the NVIDIA tools. +: "${NVIDIA_SMI_PATH:=nvidia-smi}" +: "${NVIDIA_CTK_PATH:=nvidia-ctk}" + +# --- Debug and Environment Setup --- +echo "Current PATH: $PATH" +echo "Operating System: $(uname -a)" + +# Get the script directory to reference local scripts reliably. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# --- Install Prerequisites --- +echo "Installing kubectl and helm..." +bash "$SCRIPT_DIR/install-kubectl.sh" +bash "$SCRIPT_DIR/install-helm.sh" + +# --- Configure BPF (if available) --- +if [ -f /proc/sys/net/core/bpf_jit_harden ]; then + echo "Configuring BPF: Setting net.core.bpf_jit_harden=0" + if ! grep -q "net.core.bpf_jit_harden=0" /etc/sysctl.conf; then + echo "net.core.bpf_jit_harden=0" | sudo tee -a /etc/sysctl.conf + fi + sudo sysctl -p +else + echo "BPF JIT hardening configuration not available, skipping..." +fi + +# --- NVIDIA GPU Setup --- +GPU_AVAILABLE=false +if command -v "$NVIDIA_SMI_PATH" >/dev/null 2>&1; then + echo "NVIDIA GPU detected via nvidia-smi at: $(command -v "$NVIDIA_SMI_PATH")" + if command -v "$NVIDIA_CTK_PATH" >/dev/null 2>&1; then + echo "nvidia-ctk found at: $(command -v "$NVIDIA_CTK_PATH")" + GPU_AVAILABLE=true + else + echo "nvidia-ctk not found. Please install the NVIDIA Container Toolkit to enable GPU support." + fi +fi + +if [ "$GPU_AVAILABLE" = true ]; then + # Configure Docker for GPU support. + echo "Configuring Docker runtime for GPU support..." + if sudo "$NVIDIA_CTK_PATH" runtime configure --runtime=docker; then + echo "Restarting Docker to apply changes..." + echo "WARNING: Restarting Docker will stop and restart all containers." + sudo systemctl restart docker + echo "Docker runtime configured successfully." + else + echo "Error: Failed to configure Docker runtime using the NVIDIA Container Toolkit." + exit 1 + fi + + # Install the GPU Operator via Helm. + echo "Adding NVIDIA helm repo and updating..." + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update + echo "Installing GPU Operator..." + helm install --wait gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator --version=v24.9.1 +fi + +echo "NVIDIA GPU Setup complete." diff --git a/utils/install-calico.sh b/utils/install-calico.sh new file mode 100755 index 000000000..e8bf8608c --- /dev/null +++ b/utils/install-calico.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Refer to https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart +# for more information. 
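+# Note: this script only issues kubectl commands, so kubectl must already be
+# configured against the target cluster (e.g., kubeadm init has completed and
+# ~/.kube/config is in place) before running it.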
+ +# Install the Tigera operator and custom resource definitions: +kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.30.0/manifests/tigera-operator.yaml + +# Install Calico by creating the necessary custom resources: +kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.30.0/manifests/custom-resources.yaml diff --git a/utils/install-cri-o.sh b/utils/install-cri-o.sh new file mode 100755 index 000000000..87c7a7c7f --- /dev/null +++ b/utils/install-cri-o.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Refer to https://github.com/cri-o/packaging/blob/main/README.md#distributions-using-deb-packages +# and +# https://github.com/cri-o/cri-o/blob/main/contrib/cni/README.md#configuration-directory +# for more information. + +# Install the dependencies for adding repositories +sudo apt-get update +sudo apt-get install -y software-properties-common curl + +export CRIO_VERSION=v1.32 + +# Add the CRI-O repository +curl -fsSL https://download.opensuse.org/repositories/isv:/cri-o:/stable:/$CRIO_VERSION/deb/Release.key | + sudo gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg + +echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://download.opensuse.org/repositories/isv:/cri-o:/stable:/$CRIO_VERSION/deb/ /" | + sudo tee /etc/apt/sources.list.d/cri-o.list + +# Install the packages +sudo apt-get update +sudo apt-get install -y cri-o + +# Update crio config by creating (or editing) /etc/crio/crio.conf +sudo tee /etc/crio/crio.conf > /dev/null <