diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 7459c508f..fd2b2d800 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -1,4 +1,4 @@ -{{- if .Values.servingEngineSpec.enableEngine -}} +{{- if and .Values.servingEngineSpec.enableEngine (not (hasKey .Values.servingEngineSpec "raySpec")) -}} {{- range $modelSpec := .Values.servingEngineSpec.modelSpec }} {{- $kv_role := "kv_both" }} {{- $kv_rank := 0 }} diff --git a/helm/templates/ray-cluster.yaml b/helm/templates/ray-cluster.yaml new file mode 100644 index 000000000..3a1454c30 --- /dev/null +++ b/helm/templates/ray-cluster.yaml @@ -0,0 +1,620 @@ +{{- if and .Values.servingEngineSpec.enableEngine (hasKey .Values.servingEngineSpec "raySpec")}} +{{- range $modelSpec := .Values.servingEngineSpec.modelSpec }} +{{- with $ -}} +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-raycluster" + namespace: {{ .Release.Namespace }} + labels: + model: {{ $modelSpec.name }} + helm-release-name: {{ .Release.Name }} + {{- include "chart.engineLabels" . | nindent 4 }} +spec: + headGroupSpec: + serviceType: ClusterIP + rayStartParams: + dashboard-host: "0.0.0.0" + template: + metadata: + labels: + model: {{ $modelSpec.name }} + helm-release-name: {{ .Release.Name }} + {{- include "chart.engineLabels" . | nindent 10 }} + spec: + terminationGracePeriodSeconds: 0 + {{- if .Values.servingEngineSpec.securityContext }} + securityContext: + {{- toYaml .Values.servingEngineSpec.securityContext | nindent 10 }} + {{- end }} + containers: + - name: vllm-ray-head + image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}" + command: + - >- + /bin/bash -c " + cp /entrypoint/vllm-entrypoint.sh \$HOME/vllm-entrypoint.sh && + chmod +x \$HOME/vllm-entrypoint.sh && + \$HOME/vllm-entrypoint.sh & + echo \"Running vllm command in the background.\"" + env: + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: EXPECTED_NODES + value: "{{ add $modelSpec.replicaCount 1}}" + - name: HF_HOME + {{- if hasKey $modelSpec "pvcStorage" }} + value: /data + {{- else }} + value: /tmp + {{- end }} + {{- with $modelSpec.vllmConfig}} + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" + {{- if hasKey . "v1" }} + - name: VLLM_USE_V1 + value: {{ $modelSpec.vllmConfig.v1 | quote }} + {{- else }} + - name: VLLM_USE_V1 + value: "0" + {{- end}} + {{- end}} + {{- if $modelSpec.hf_token }} + - name: HF_TOKEN + {{- if kindIs "string" $modelSpec.hf_token }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: hf_token_{{ $modelSpec.name }} + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $modelSpec.hf_token.secretName }} + key: {{ $modelSpec.hf_token.secretKey }} + {{- end }} + {{- end }} + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} + {{- with $modelSpec.env }} + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- if $modelSpec.lmcacheConfig }} + {{- if $modelSpec.lmcacheConfig.enabled }} + - name: LMCACHE_USE_EXPERIMENTAL + value: "True" + - name: VLLM_RPC_TIMEOUT + value: "1000000" + {{- end }} + {{- if $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }} + - name: LMCACHE_LOCAL_CPU + value: "True" + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: "{{ $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }}" + {{- end }} + {{- if $modelSpec.lmcacheConfig.diskOffloadingBufferSize }} + - name: LMCACHE_LOCAL_DISK + value: "True" + - name: LMCACHE_MAX_LOCAL_DISK_SIZE + value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}" + {{- end }} + {{- if .Values.cacheserverSpec }} + - name: LMCACHE_REMOTE_URL + value: "{{ include "cacheserver.formatRemoteUrl" (dict "service_name" (print .Release.Name "-cache-server-service") "port" .Values.cacheserverSpec.servicePort) }}" + - name: LMCACHE_REMOTE_SERDE + value: "{{ .Values.cacheserverSpec.serde }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "enableController" }} + - name: LMCACHE_ENABLE_CONTROLLER + value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.enableController | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }} + - name: LMCACHE_INSTANCE_ID + value: {{ $modelSpec.lmcacheConfig.instanceId | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }} + - name: LMCACHE_CONTROLLER_URL + value: "{{ .Release.Name }}-{{$modelSpec.name}}-service:{{ $modelSpec.lmcacheConfig.controllerPort }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }} + - name: LMCACHE_WORKER_PORT + value: "{{ .Release.Name }}-service:{{ $modelSpec.lmcacheConfig.workerPort }}" + {{- end }} + {{- end }} + {{- if .Values.servingEngineSpec.configs }} + envFrom: + - configMapRef: + name: "{{ .Release.Name }}-configs" + {{- end }} + ports: + - name: {{ include "chart.container-port-name" . }} + containerPort: {{ include "chart.container-port" . }} + readinessProbe: + httpGet: + path: /health + port: {{ include "chart.container-port" . 
}} + failureThreshold: 1 + periodSeconds: 10 + livenessProbe: + exec: + command: ["/bin/bash", "-c", "echo TBD"] + resources: + limits: + cpu: {{ default "2" .Values.servingEngineSpec.raySpec.headNode.requestCPU }} + memory: {{ default "8Gi" .Values.servingEngineSpec.raySpec.headNode.requestMemory }} + {{- if hasKey .Values.servingEngineSpec.raySpec.headNode "requestGPU" }} + nvidia.com/gpu: {{ .Values.servingEngineSpec.raySpec.headNode.requestGPU }} + {{- end }} + startupProbe: + exec: + command: ["/bin/bash", "-c", "python3 /scripts/wait_for_ray.py"] + failureThreshold: 30 + periodSeconds: 15 + timeoutSeconds: 10 + volumeMounts: + - name: wait-script + mountPath: /scripts + - name: vllm-script + mountPath: /entrypoint + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumeMounts") }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + mountPath: /data + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + mountPath: /dev/shm + {{- end}} + {{- end}} + {{- if $modelSpec.chatTemplate }} + - name: vllm-templates + mountPath: /templates + {{- end }} + {{- if hasKey $modelSpec "extraVolumeMounts" }} + {{- toYaml $modelSpec.extraVolumeMounts | nindent 14 }} + {{- end }} + {{- end }} + volumes: + - name: wait-script + configMap: + name: wait-for-ray-script + - name: vllm-script + configMap: + name: vllm-start-script + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumes") }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + persistentVolumeClaim: + claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim" + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ default "20Gi" $modelSpec.shmSize }} + {{- end}} + {{- end}} + {{- if $modelSpec.chatTemplate}} + {{- if hasKey $modelSpec "chatTemplateConfigMap" }} + - name: {{ .Release.Name }}-chat-templates + configMap: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates" + {{- else }} + - name: vllm-templates + persistentVolumeClaim: + claimName: vllm-templates-pvc + {{- end }} + {{- end}} + {{- if hasKey $modelSpec "extraVolumes" }} + {{- toYaml $modelSpec.extraVolumes | nindent 8 }} + {{- end}} + {{- end}} + {{- if $modelSpec.imagePullSecret }} + imagePullSecrets: + - name: {{ $modelSpec.imagePullSecret }} + {{- end }} + {{- if .Values.servingEngineSpec.tolerations }} + {{- with .Values.servingEngineSpec.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + {{- if .Values.servingEngineSpec.runtimeClassName }} + runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }} + {{- end }} + {{- if .Values.servingEngineSpec.schedulerName }} + schedulerName: {{ .Values.servingEngineSpec.schedulerName }} + {{- end }} + {{- if $modelSpec.nodeName }} + nodeName: {{ $modelSpec.nodeName }} + {{- else if $modelSpec.nodeSelectorTerms}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + {{- with $modelSpec.nodeSelectorTerms }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- end }} + workerGroupSpecs: + - rayStartParams: {} + replicas: {{ $modelSpec.replicaCount }} + groupName: ray + template: + metadata: + labels: + model: {{ $modelSpec.name }} + helm-release-name: {{ .Release.Name }} + {{- if .Values.servingEngineSpec.securityContext }} + securityContext: + {{- toYaml .Values.servingEngineSpec.securityContext | nindent 8 }} + {{- end }} + spec: + containers: + - name: vllm-ray-worker + image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}" + env: + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: HF_HOME + {{- if hasKey $modelSpec "pvcStorage" }} + value: /data + {{- else }} + value: /tmp + {{- end }} + {{- with $modelSpec.vllmConfig}} + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" + {{- if hasKey . "v1" }} + - name: VLLM_USE_V1 + value: {{ $modelSpec.vllmConfig.v1 | quote }} + {{- else }} + - name: VLLM_USE_V1 + value: "0" + {{- end}} + {{- end}} + {{- if $modelSpec.hf_token }} + - name: HF_TOKEN + {{- if kindIs "string" $modelSpec.hf_token }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: hf_token_{{ $modelSpec.name }} + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $modelSpec.hf_token.secretName }} + key: {{ $modelSpec.hf_token.secretKey }} + {{- end }} + {{- end }} + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} + {{- with $modelSpec.env }} + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- if $modelSpec.lmcacheConfig }} + {{- if $modelSpec.lmcacheConfig.enabled }} + - name: LMCACHE_USE_EXPERIMENTAL + value: "True" + - name: VLLM_RPC_TIMEOUT + value: "1000000" + {{- end }} + {{- if $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }} + - name: LMCACHE_LOCAL_CPU + value: "True" + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: "{{ $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }}" + {{- end }} + {{- if $modelSpec.lmcacheConfig.diskOffloadingBufferSize }} + - name: LMCACHE_LOCAL_DISK + value: "True" + - name: LMCACHE_MAX_LOCAL_DISK_SIZE + value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}" + {{- end }} + {{- if .Values.cacheserverSpec }} + - name: LMCACHE_REMOTE_URL + value: "{{ include "cacheserver.formatRemoteUrl" (dict "service_name" (print .Release.Name "-cache-server-service") "port" .Values.cacheserverSpec.servicePort) }}" + - name: LMCACHE_REMOTE_SERDE + value: "{{ .Values.cacheserverSpec.serde }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "enableController" }} + - name: LMCACHE_ENABLE_CONTROLLER + value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.enableController | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }} + - name: LMCACHE_INSTANCE_ID + value: {{ $modelSpec.lmcacheConfig.instanceId | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }} + - name: LMCACHE_CONTROLLER_URL + value: "{{ .Release.Name }}-{{$modelSpec.name}}-service:{{ $modelSpec.lmcacheConfig.controllerPort }}" + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }} + - name: LMCACHE_WORKER_PORT + value: "{{ .Release.Name }}-service:{{ $modelSpec.lmcacheConfig.workerPort }}" + {{- end }} + {{- end }} + {{- if .Values.servingEngineSpec.configs }} + envFrom: + - configMapRef: + name: "{{ .Release.Name }}-configs" + {{- end }} + readinessProbe: + exec: + command: ["/bin/bash", "-c", "echo TBD"] + livenessProbe: + exec: + command: ["/bin/bash", "-c", "echo TBD"] + resources: {{- include "chart.resources" $modelSpec | nindent 16 }} + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumeMounts") }} + volumeMounts: + {{- end }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + mountPath: /data + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + mountPath: /dev/shm + {{- end}} + {{- end}} + {{- if $modelSpec.chatTemplate }} + - name: vllm-templates + mountPath: /templates + {{- end }} + {{- if hasKey $modelSpec "extraVolumeMounts" }} + {{- toYaml $modelSpec.extraVolumeMounts | nindent 14 }} + {{- end }} + volumes: + - name: wait-script + configMap: + name: wait-for-ray-script + - name: vllm-script + configMap: + name: vllm-start-script + {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumes") }} + {{- if hasKey $modelSpec "pvcStorage" }} + - name: {{ .Release.Name }}-storage + persistentVolumeClaim: + claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim" + {{- end }} + {{- with $modelSpec.vllmConfig }} + {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ default "20Gi" $modelSpec.shmSize }} + {{- end}} + {{- end}} + {{- if 
$modelSpec.chatTemplate}} + {{- if hasKey $modelSpec "chatTemplateConfigMap" }} + - name: {{ .Release.Name }}-chat-templates + configMap: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates" + {{- else }} + - name: vllm-templates + persistentVolumeClaim: + claimName: vllm-templates-pvc + {{- end }} + {{- end}} + {{- if hasKey $modelSpec "extraVolumes" }} + {{- toYaml $modelSpec.extraVolumes | nindent 8 }} + {{- end}} + {{- end}} + {{- if $modelSpec.imagePullSecret }} + imagePullSecrets: + - name: {{ $modelSpec.imagePullSecret }} + {{- end }} + {{- if .Values.servingEngineSpec.tolerations }} + {{- with .Values.servingEngineSpec.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + + {{- if .Values.servingEngineSpec.runtimeClassName }} + runtimeClassName: {{ .Values.servingEngineSpec.runtimeClassName }} + {{- end }} + {{- if .Values.servingEngineSpec.schedulerName }} + schedulerName: {{ .Values.servingEngineSpec.schedulerName }} + {{- end }} + {{- if $modelSpec.nodeName }} + nodeName: {{ $modelSpec.nodeName }} + {{- else if $modelSpec.nodeSelectorTerms}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + {{- with $modelSpec.nodeSelectorTerms }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- end }} + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: wait-for-ray-script +data: + wait_for_ray.py: | + import ray + import logging + import os + import sys + + logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s') + + try: + ray.init(address="auto") + except Exception as e: + logging.error(f"Failed to initialize Ray: {e}") + sys.exit(1) + + expected_nodes = int(os.environ.get("EXPECTED_NODES", "1")) + + alive_nodes = [n for n in ray.nodes() if n["Alive"]] + alive_count = len(alive_nodes) + + logging.info(f"Ray cluster status: {alive_count}/{expected_nodes} nodes alive.") + + if alive_count == expected_nodes: + logging.info("Cluster is ready.") + sys.exit(0) + else: + logging.info("Cluster is NOT ready.") + sys.exit(1) +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-start-script +data: + vllm-entrypoint.sh: | + #!/bin/bash + set -e + + echo "Waiting for Ray to become available..." + until python3 /scripts/wait_for_ray.py; do + echo "Ray not ready yet. Retrying in 2 seconds..." + sleep 2 + done + + echo "Ray is ready. Starting vLLM..." + + # Start constructing command + ARGS=( + "vllm" + "serve" + "{{ $modelSpec.modelURL | quote }}" + "--host" "0.0.0.0" + "--port" "{{ include "chart.container-port" . }}" + "--distributed-executor-backend" "ray" + ) + + {{- if $modelSpec.enableLoRA }} + ARGS+=("--enable-lora") + {{- end }} + + {{- if $modelSpec.enableTool }} + ARGS+=("--enable-auto-tool-choice") + {{- end }} + + {{- if $modelSpec.toolCallParser }} + ARGS+=("--tool-call-parser" {{ $modelSpec.toolCallParser | quote }}) + {{- end }} + + {{- with $modelSpec.vllmConfig }} + {{- if hasKey . "enableChunkedPrefill" }} + {{- if .enableChunkedPrefill }} + ARGS+=("--enable-chunked-prefill") + {{- else }} + ARGS+=("--no-enable-chunked-prefill") + {{- end }} + {{- end }} + + {{- if .enablePrefixCaching }} + ARGS+=("--enable-prefix-caching") + {{- end }} + + {{- if hasKey . "maxModelLen" }} + ARGS+=("--max-model-len" {{ .maxModelLen | quote }}) + {{- end }} + + {{- if hasKey . "dtype" }} + ARGS+=("--dtype" {{ .dtype | quote }}) + {{- end }} + + {{- if hasKey . 
"tensorParallelSize" }} + ARGS+=("--tensor-parallel-size" {{ .tensorParallelSize | quote }}) + {{- end }} + + {{- if hasKey . "pipelineParallelSize" }} + ARGS+=("--pipeline-parallel-size" {{ .pipelineParallelSize | quote }}) + {{- end }} + + {{- if hasKey . "maxNumSeqs" }} + ARGS+=("--max-num-seqs" {{ .maxNumSeqs | quote }}) + {{- end }} + + {{- if hasKey . "gpuMemoryUtilization" }} + ARGS+=("--gpu-memory-utilization" {{ .gpuMemoryUtilization | quote }}) + {{- end }} + + {{- if hasKey . "maxLoras" }} + ARGS+=("--max-loras" {{ .maxLoras | quote }}) + {{- end }} + + {{- range .extraArgs }} + ARGS+=({{ . | quote }}) + {{- end }} + {{- end }} + + {{- if $modelSpec.lmcacheConfig }} + {{- if $modelSpec.lmcacheConfig.enabled }} + {{- if hasKey $modelSpec.vllmConfig "v1" }} + {{- if eq (toString $modelSpec.vllmConfig.v1) "1" }} + ARGS+=("--kv-transfer-config" "{\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}") + {{- else }} + ARGS+=("--kv-transfer-config" "{\"kv_connector\":\"LMCacheConnector\",\"kv_role\":\"kv_both\"}") + {{- end }} + {{- else }} + ARGS+=("--kv-transfer-config" "{\"kv_connector\":\"LMCacheConnector\",\"kv_role\":\"kv_both\"}") + {{- end }} + {{- end }} + {{- end }} + + {{- if $modelSpec.chatTemplate }} + ARGS+=("--chat-template" {{ $modelSpec.chatTemplate | quote }}) + {{- end }} + + echo "Executing: ${ARGS[@]}" + exec "${ARGS[@]}" + + +{{- if and $modelSpec.chatTemplate (hasKey $modelSpec "chatTemplateConfigMap") }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates" + namespace: "{{ .Release.Namespace }}" +data: + {{ $modelSpec.chatTemplate }}: |- + {{ $modelSpec.chatTemplateConfigMap }} +{{- end }} +{{- end }} +--- +{{- end }} +{{- end }} diff --git a/tutorials/00-a-install-multinode-kubernetes-env.md b/tutorials/00-a-install-multinode-kubernetes-env.md new file mode 100644 index 000000000..2f7dae565 --- /dev/null +++ b/tutorials/00-a-install-multinode-kubernetes-env.md @@ -0,0 +1,411 @@ +# Tutorial: Setting Up a Kubernetes Environment with GPUs on Your GPU Server + +## Introduction + +This tutorial provides a comprehensive guide to setting up a Kubernetes environment across multiple GPU-enabled servers. It covers the installation and configuration of `kubeadm`, `kubectl`, and `helm`, with a focus on ensuring GPU compatibility for workloads that require accelerated computing. By the end of this tutorial, you will have a fully operational multi-node Kubernetes cluster prepared for deploying the vLLM Production Stack. + +## Table of Contents + +- [Introduction](#introduction) +- [Table of Contents](#table-of-contents) +- [Prerequisites](#prerequisites) +- [Steps](#steps) + - [Step 1: Installing kubeadm on each node](#step-1-installing-kubeadm-on-each-node) + - [Step 2: Installing container runtime on each node](#step-2-installing-container-runtime-on-each-node) + - [Step 3: Setting up a control plane node](#step-3-setting-up-a-control-plane-node) + - [Step 4: Setting and joining a worker node](#step-4-setting-and-joining-a-worker-node) + - [Step 5: Installing container network interface](#step-5-installing-container-network-interface) + - [Step 6: Installing nvidia device plugin](#step-6-installing-nvidia-device-plugin) + +## Prerequisites + +Before you begin, ensure the following: + +1. **GPU Server Requirements:** + - A server with a GPU and drivers properly installed (e.g., NVIDIA drivers). 
+   - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed for GPU workloads.
+
+2. **Access and Permissions:**
+   - Root or administrative access to the server.
+   - Internet connectivity to download required packages and tools.
+
+3. **Environment Setup:**
+   - A Linux-based operating system (e.g., Ubuntu 20.04 or later).
+   - Basic understanding of Linux shell commands.
+
+4. **Tested Environment:**
+   - This guide was tested on a Debian 11 (Bullseye) operating system with 24 CPUs, 100 GiB of RAM, and 300 GiB of disk space. Note that certain configurations or settings may vary or not function as expected on other systems, depending on your specific environment.
+
+## Steps
+
+### Step 1: Installing kubeadm on each node
+
+1. Access to a bare-metal server that will serve as the control plane node.
+
+2. Clone the repository and navigate to the [`utils/`](../utils/) folder:
+
+   ```bash
+   git clone https://github.com/vllm-project/production-stack.git
+   cd production-stack/utils
+   ```
+
+3. Execute the script [`install-kubeadm.sh`](../utils/install-kubeadm.sh):
+
+   ```bash
+   bash install-kubeadm.sh
+   ```
+
+4. **Expected Output:**
+   - Confirmation that `kubeadm` was downloaded and installed.
+   - Verification message using:
+
+     ```bash
+     kubeadm version
+     ```
+
+     Example output:
+
+     ```plaintext
+     kubeadm version: &version.Info{Major:"1", Minor:"32", GitVersion:"v1.32.4", GitCommit:"59526cd4867447956156ae3a602fcbac10a2c335", GitTreeState:"clean", BuildDate:"2025-04-22T16:02:27Z", GoVersion:"go1.23.6", Compiler:"gc", Platform:"linux/amd64"}
+     ```
+
+5. **Explanation:**
+   This script downloads version 1.32 of [`kubeadm`](https://v1-32.docs.kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/), the Kubernetes command-line tool for cluster bootstrapping and management, along with `kubectl` and `kubelet`, on the current node.
+
+6. Repeat steps 1 to 3 on your other bare-metal server, which will serve as a worker node.
+
+### Step 2: Installing container runtime on each node
+
+1. Access to a bare-metal server that will serve as the control plane node.
+
+2. Execute the script [`install-cri-o.sh`](../utils/install-cri-o.sh):
+
+   ```bash
+   bash install-cri-o.sh
+   ```
+
+3. **Expected Output:**
+   - Successful installation of the CRI-O runtime.
+   - Verification message using:
+
+     ```bash
+     sudo systemctl status crio
+     ```
+
+     Example output:
+
+     ```plaintext
+     ● crio.service - Container Runtime Interface for OCI (CRI-O)
+          Loaded: loaded (/lib/systemd/system/crio.service; enabled; vendor preset: enabled)
+          Active: active (running) since Fri 2025-05-16 16:32:31 UTC; 20h ago
+            Docs: https://github.com/cri-o/cri-o
+        Main PID: 2332175 (crio)
+           Tasks: 61
+          Memory: 14.4G
+             CPU: 17min 55.486s
+          CGroup: /system.slice/crio.service
+     ```
+
+4. **Explanation:**
+   This script downloads, installs, and configures v1.32 of [`CRI-O`](https://github.com/cri-o/packaging/blob/main/README.md#distributions-using-deb-packages), one of the container runtimes Kubernetes can use to manage pods on your cluster.
+
+5. Repeat steps 1 to 2 on your other bare-metal server, which will serve as a worker node.
+
+### Step 3: Setting up a control plane node
+
+1. Access to a bare-metal server that will serve as the control plane node.
+
+2. 
Execute the following command and wait for it to complete: + + ```bash + # Look for a line starting with "default via" + # For example: default via 10.128.0.1 dev ens5 + ip route show + + # Or get your network interface's ip address using the following command: + export K8S_NET_IP=$(ip addr show dev $(ip route show | awk '/^default/ {print $5}') | awk '/inet / {print $2}' | cut -d/ -f1) + echo "K8S_NET_IP=${K8S_NET_IP}" + + # On one of the nodes designated to become a control plane node, execute the following command: + sudo kubeadm init \ + --cri-socket=unix:///var/run/crio/crio.sock \ + --apiserver-advertise-address=${K8S_NET_IP} \ + --pod-network-cidr=192.168.0.0/16 + ``` + + Example output: + + ```plaintext + # Your Kubernetes control-plane has initialized successfully! + + # To start using your cluster, you need to run the following as a regular user: + + # mkdir -p $HOME/.kube + # sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config + # sudo chown $(id -u):$(id -g) $HOME/.kube/config + + # Alternatively, if you are the root user, you can run: + + # export KUBECONFIG=/etc/kubernetes/admin.conf + + # You should now deploy a pod network to the cluster. + # Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at: + # https://kubernetes.io/docs/concepts/cluster-administration/addons/ + + # Then you can join any number of worker nodes by running the following on each as root: + + # kubeadm join --token \ + # --discovery-token-ca-cert-hash + ``` + + Perform following command to set your kube config: + + ```bash + mkdir -p $HOME/.kube + sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config + sudo chown $(id -u):$(id -g) $HOME/.kube/config + ``` + + If your control plane node is equipped with GPUs and you want GPU-enabled pods to be scheduled on it, you must remove the default taint from the node: + + ```bash + kubectl taint node instance-20250503-060921 node-role.kubernetes.io/control-plane- + ``` + +3. **Expected Output:** + - Successful initialization of control plane node. + - Verification message using: + + ```bash + kubectl get nodes -o wide + ``` + + Example output: + + ```plaintext + NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME + instance-20250503-060921 Ready control-plane 20h v1.32.4 10.xxx.x.xx Debian GNU/Linux 11 (bullseye) 5.10.0-33-cloud-amd64 cri-o://1.32.4 + ``` + + Refer to [`official kubeadm documentation`](https://v1-32.docs.kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm/) for more information. + +### Step 4: Setting and joining a worker node + +1. Access to a bare-metal server that will serve as the worker node. + +2. Execute the following command and wait for it to complete: + + ```bash + # You got following output from previous control node initialization: + + # -------------------------------------------------------------------------------- + # Your Kubernetes control-plane has initialized successfully! + # + # ... 
+ # + # Then you can join any number of worker nodes by running the following on each as root: + # + # kubeadm join --token \ + # --discovery-token-ca-cert-hash sha256: + # -------------------------------------------------------------------------------- + + # Execute the following command on your worker node: + sudo kubeadm join :6443 --token \ + --discovery-token-ca-cert-hash sha256: \ + --cri-socket=unix:///var/run/crio/crio.sock + ``` + + If you lost above information, you can get the token and hash by running following command on your CONTROL PLANE node:: + + ```bash + # To get : + export K8S_NET_IP=$(ip addr show dev $(ip route show | awk '/^default/ {print $5}') | awk '/inet / {print $2}' | cut -d/ -f1) + echo "K8S_NET_IP=${K8S_NET_IP}" + + # To get : + sudo kubeadm token create + + # To get : + openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | \ + openssl rsa -pubin -outform der 2>/dev/null | \ + sha256sum | awk '{print $1}' + ``` + + Example output: + + ```plaintext + sudo kubeadm join :6443 --token --discovery-token-ca-cert-hash sha256: --cri-socket=unix:///var/run/crio/crio.sock + [preflight] Running pre-flight checks + [preflight] Reading configuration from the "kubeadm-config" ConfigMap in namespace "kube-system"... + [preflight] Use 'kubeadm init phase upload-config --config your-config.yaml' to re-upload it. + [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml" + [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env" + [kubelet-start] Starting the kubelet + [kubelet-check] Waiting for a healthy kubelet at http://127.0.0.1:10248/healthz. This can take up to 4m0s + [kubelet-check] The kubelet is healthy after 500.795239ms + [kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap + + This node has joined the cluster: + * Certificate signing request was sent to apiserver and a response was received. + * The Kubelet was informed of the new secure connection details. + + Run 'kubectl get nodes' on the control-plane to see this node join the cluster. + ``` + + Copy kube config file from your control plane node to current worker node (with ssh or scp): + + ```bash + mkdir -p $HOME/.kube + scp YOUR_SSH_ACCOUNT:$HOME/.kube/config $HOME/.kube/config + sudo chown $(id -u):$(id -g) $HOME/.kube/config + ``` + +3. **Expected Output:** + - Successful initialization of worker node. + - Verification message using: + + ```bash + kubectl get nodes -o wide + ``` + + Example output: + + ```plaintext + NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME + instance-20250503-060921 Ready control-plane 20h v1.32.4 10.xxx.x.xxx Debian GNU/Linux 11 (bullseye) 5.10.0-33-cloud-amd64 cri-o://1.32.4 + insudevmachine Ready 14m v1.32.4 10.yyy.y.yyy Debian GNU/Linux 11 (bullseye) 5.10.0-33-cloud-amd64 cri-o://1.32.4 + ``` + + Refer to [`official kubeadm documentation`](https://kubernetes.io/docs/reference/setup-tools/kubeadm/kubeadm-join/) for more information. + +### Step 5: Installing container network interface + +1. Access to a bare-metal server that will serve as the control plane node. + +2. Clone the repository and navigate to the [`utils/`](../utils/) folder: + + ```bash + git clone https://github.com/vllm-project/production-stack.git + cd production-stack/utils + ``` + +3. Execute the script [`install-calico.sh`](../utils/install-calico.sh): + + ```bash + bash install-calico.sh + ``` + +4. 
**Expected Output:** + - Confirmation that the `Tigera` operator and its associated custom resources have been successfully installed. + - Verification message using: + + ```bash + kubectl get pods -o wide + ``` + + Example output: + + ```plaintext + NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS + GATES + calico-apiserver calico-apiserver-cccf4bb9f-8lbc7 1/1 Running 0 21h 192.168.190.7 instance-20250503-060921 + calico-apiserver calico-apiserver-cccf4bb9f-knn9c 1/1 Running 0 21h 192.168.190.4 instance-20250503-060921 + calico-system calico-kube-controllers-56dfdbb787-c24gd 1/1 Running 0 21h 192.168.190.2 instance-20250503-060921 + calico-system calico-node-dtbcq 1/1 Running 0 21h 10.xxx.xxx.xxx instance-20250503-060921 + calico-system calico-node-jptsp 1/1 Running 0 33m 10.xxx.xxx.xxx insudevmachine + calico-system calico-typha-b7d75bc58-h6vrb 1/1 Running 0 37m 10.xxx.xxx.xxx instance-20250503-060921 + calico-system csi-node-driver-884sn 2/2 Running 0 26m 192.168.165.193 insudevmachine + calico-system csi-node-driver-bb7dl 2/2 Running 0 21h 192.168.190.1 instance-20250503-060921 + calico-system goldmane-7b5b4cd5d9-6bk5p 1/1 Running 0 21h 192.168.190.6 instance-20250503-060921 + calico-system whisker-5dbf545674-hnkpz 2/2 Running 0 21h 192.168.190.8 instance-20250503-060921 + ... + kube-system coredns-668d6bf9bc-5hvx7 1/1 Running 0 21h 192.168.190.3 instance-20250503-060921 + kube-system coredns-668d6bf9bc-wb7qq 1/1 Running 0 21h 192.168.190.5 instance-20250503-060921 + ``` + + Ensure that the status of each node is marked as “Ready” and that the CoreDNS pods are running: + + ```bash + kubectl get nodes + + # NAME STATUS ROLES AGE VERSION + # instance-20250503-060921 Ready control-plane 21h v1.32.4 + # insudevmachine Ready 37m v1.32.4 + + kubectl get pods -n kube-system | grep -i coredns + + # coredns-668d6bf9bc-5hvx7 1/1 Running 0 21h + # coredns-668d6bf9bc-wb7qq 1/1 Running 0 21h + ``` + +5. **Explanation:** + This script downloads version 3.30.0 of [`calico`](https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart), a container network interface (CNI) plugin for Kubernetes clusters. + +### Step 6: Installing nvidia device plugin + +1. Access to a bare-metal server that will serve as the control plane node. + +2. Clone the repository and navigate to the [`utils/`](../utils/) folder: + + ```bash + git clone https://github.com/vllm-project/production-stack.git + cd production-stack/utils + ``` + +3. Execute the script [`init-nvidia-gpu-setup-k8s.sh`](../utils/init-nvidia-gpu-setup-k8s.sh): + + ```bash + bash init-nvidia-gpu-setup-k8s.sh + ``` + +4. **Explanation:** + - Configures the system to support GPU workloads by enabling the NVIDIA Container Toolkit and starting Minikube with GPU support. + - Installs the NVIDIA `gpu-operator` chart to manage GPU resources within the cluster. + +5. **Expected Output:** + If everything goes smoothly, you should see the example output like following: + + ```plaintext + ... + NAME: gpu-operator-1737507918 + LAST DEPLOYED: Wed Jan 22 01:05:21 2025 + NAMESPACE: gpu-operator + STATUS: deployed + REVISION: 1 + TEST SUITE: None + ``` + +6. Some troubleshooting tips for installing gpu-operator: + + If gpu-operator fails to start because of the common seen “too many open files” issue for minikube (and [kind](https://kind.sigs.k8s.io/)), then a quick fix below may be helpful. + + The issue can be observed by one or more gpu-operator pods in `CrashLoopBackOff` status, and be confirmed by checking their logs. 
For example, + + ```console + $ kubectl -n gpu-operator logs daemonset/nvidia-device-plugin-daemonset -c nvidia-device-plugin + IS_HOST_DRIVER=true + NVIDIA_DRIVER_ROOT=/ + DRIVER_ROOT_CTR_PATH=/host + NVIDIA_DEV_ROOT=/ + DEV_ROOT_CTR_PATH=/host + Starting nvidia-device-plugin + I0131 19:35:42.895845 1 main.go:235] "Starting NVIDIA Device Plugin" version=< + d475b2cf + commit: d475b2cfcf12b983a4975d4fc59d91af432cf28e + > + I0131 19:35:42.895917 1 main.go:238] Starting FS watcher for /var/lib/kubelet/device-plugins + E0131 19:35:42.895933 1 main.go:173] failed to create FS watcher for /var/lib/kubelet/device-plugins/: too many open files + ``` + + The fix is [well documented](https://kind.sigs.k8s.io/docs/user/known-issues#pod-errors-due-to-too-many-open-files) by kind, it also works for minikube. + +## Conclusion + +By completing this tutorial, you have successfully established a multi-node Kubernetes environment with GPU support on your servers. You are now prepared to deploy and test the vLLM Production Stack within this Kubernetes cluster. For additional configuration and workload-specific guidance, please refer to the official documentation for `kubectl`, `helm`, and `minikube`. + +What's next: + +- [00-b-install-kuberay-operator](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-b-install-kuberay-operator.md) diff --git a/tutorials/00-b-install-kuberay-operator.md b/tutorials/00-b-install-kuberay-operator.md new file mode 100644 index 000000000..470521e1c --- /dev/null +++ b/tutorials/00-b-install-kuberay-operator.md @@ -0,0 +1,85 @@ +# Tutorial: Setting Up a Kuberay Operator on Your Kubernetes Environment + +## Introduction + +This tutorial provides a step-by-step guide to installing and configuring the KubeRay operator within a Kubernetes environment. We will use the helm chart to set up kuberay, enabling distributed inference with vLLM. By the end of this tutorial, you will have a fully operational KubeRay operator ready to support the deployment of the vLLM Production Stack. + +## Table of Contents + +- [Introduction](#introduction) +- [Table of Contents](#table-of-contents) +- [Prerequisites](#prerequisites) +- [Steps](#steps) + - [Step 1: Install the KubeRay Operator Using Helm](#step-1-install-the-kuberay-operator-using-helm) + - [Step 2: Verify the KubeRay Configuration](#step-2-verify-the-kuberay-configuration) + +## Prerequisites + +Before you begin, ensure the following: + +1. **GPU Server Requirements:** + - A server with a GPU and drivers properly installed (e.g., NVIDIA drivers). + - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed for GPU workloads. + +2. **Access and Permissions:** + - Root or administrative access to the server. + - Internet connectivity to download required packages and tools. + +3. **Environment Setup:** + - A Linux-based operating system (e.g., Ubuntu 20.04 or later). + - Basic understanding of Linux shell commands. + +4. **Kubernetes Installation:** + - To quickly and easily set up a single-node Kubernetes environment, you may install Minikube by following the instructions provided in[`00-install-kubernetes-env.md`](00-install-kubernetes-env.md). + - For setting up a multi-node cluster or a more generalized Kubernetes environment, you may install Kubernetes from scratch using Kubeadm. 
This involves configuring the container runtime and container network interface (CNI), as outlined in [`00-a-install-multinode-kubernetes-env.md`](00-a-install-multinode-kubernetes-env.md) + - If you already have a running Kubernetes cluster, you may skip this step. + +5. **Kuberay Concept Review:** + - Review the [`official KubeRay documentation`](https://docs.ray.io/en/latest/cluster/kubernetes/index.html) for additional context and best practices. + +## Steps + +### Step 1: Install the KubeRay Operator Using Helm + +1. Add the KubeRay Helm repository: + + ```bash + helm repo add kuberay https://ray-project.github.io/kuberay-helm/ + helm repo update + ``` + +2. Install the Custom Resource Definitions (CRDs) and the KubeRay operator (version 1.2.0) in the default namespace: + + ```bash + helm install kuberay-operator kuberay/kuberay-operator --version 1.2.0 + ``` + +3. **Explanation:** + This step deploys the stable KubeRay operator in your Kubernetes cluster. The operator is essential for managing Ray clusters and enables you to scale multiple vLLM instances for distributed inference workloads. + +### Step 2: Verify the KubeRay Configuration + +1. **Check the Operator Pod Status:** + - Ensure that the KubeRay operator pod is running in the default namespace: + + ```bash + kubectl get pods + ``` + +2. **Expected Output:** + Example output: + + ```plaintext + NAME READY STATUS RESTARTS AGE + kuberay-operator-975995b7d-75jqd 1/1 Running 0 25h + ``` + +## Conclusion + +You have now successfully installed and verified the KubeRay operator in your Kubernetes environment. This setup lays the foundation for deploying and managing the vLLM Production Stack for distributed inference or training workloads. + +For advanced configurations and workload-specific tuning, refer to the official documentation for kuberay, kubectl, helm, and minikube. + +What's next: + +- [15-basic-pipeline-parallel](https://github.com/vllm-project/production-stack/blob/main/tutorials/15-basic-pipeline-parallel.md) diff --git a/tutorials/15-basic-pipeline-parallel.md b/tutorials/15-basic-pipeline-parallel.md new file mode 100644 index 000000000..2d5b6d0b0 --- /dev/null +++ b/tutorials/15-basic-pipeline-parallel.md @@ -0,0 +1,309 @@ +# Tutorial: Basic vLLM Configurations + +## Introduction + +This tutorial provides a step-by-step guide for configuring and deploying the vLLM serving engine on a multi-node Kubernetes cluster with support for distributed inference using KubeRay. It also explains how to launch the vLLM serving engine with pipeline parallelism enabled. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Step 1: Basic explanation of Ray and Kuberay](#step-1-basic-explanation-of-ray-and-kuberay) +3. [Step 2: Preparing the Configuration File](#step-2-preparing-the-configuration-file) +4. [Step 3: Applying the Configuration](#step-3-applying-the-configuration) +5. [Step 4: Verifying the Ray Cluster](#step-4-verifying-the-deployment) + +## Prerequisites + +- A Kubernetes cluster with multiple nodes with GPU support, as set up in the [00-a-install-multinode-kubernetes-env tutorial](00-a-install-multinode-kubernetes-env.md). +- Install kuberay operator on the Kubernetes environment with [00-b-install-kuberay-operator tutorial](00-b-install-kuberay-operator.md). +- Helm installed on your system. +- Access to a HuggingFace token (`HF_TOKEN`). +- A basic understanding of Ray is recommended. 
For more information, refer to the [official ray documentation](https://docs.ray.io/en/latest/cluster/kubernetes/index.html). + +## Step 1: Basic explanation of Ray and Kuberay + +1. Ray is a framework designed for distributed workloads, such as distributed training and inference. It operates by running multiple processes—typically containers or pods—to distribute and synchronize tasks efficiently. + +2. Ray organizes these processes into a Ray cluster, which consists of a single head node and multiple worker nodes. The term "node" here refers to a logical process, which can be deployed as a container or pod. + +3. KubeRay is a Kubernetes operator that simplifies the creation and management of Ray clusters within a Kubernetes environment. Without KubeRay, setting up Ray nodes requires manual configuration. + +4. Using KubeRay, you can easily deploy Ray clusters on Kubernetes. These clusters enable distributed inference with vLLM, supporting both tensor parallelism and pipeline parallelism. + +## Step 2: Preparing the Configuration File + +1. Locate the example configuration file [`tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml`](assets/values-15-minimal-pipeline-parallel-example.yaml). + +2. Open the file and update the following fields: + +- Write your actual huggingface token in `hf_token: ` in the yaml file. + +### Explanation of Key Items in `values-15-minimal-pipeline-parallel-example.yaml` + +- **`raySpec`**: Required when using KubeRay to enable pipeline parallelism. +- **`headNode`**: Specifies the resource requirements for the Kuberay head node and must be defined accordingly: + - **`requestCPU`**: The amount of CPU resources requested for Kuberay head pod. + - **`requestMemory`**: Memory allocation for Kuberay head pod. Sufficient memory is required to load the model. + - **`requestGPU`**: Defines the number of GPUs to allocate for the KubeRay head pod. Currently, the Ray head node must also participate in both tensor parallelism and pipeline parallelism. This requirement exists because the `vllm serve ...` command is executed on the Ray head node, and vLLM mandates that the pod where this command is run must have at least one visible GPU. +- **`name`**: The unique identifier for your model deployment. +- **`repository`**: The Docker repository containing the model's serving engine image. +- **`tag`**: Specifies the version of the model image to use. +- **`modelURL`**: The URL pointing to the model on Hugging Face or another hosting service. +- **`replicaCount`**: The number of total Kuberay worker pods. +- **`requestCPU`**: The amount of CPU resources requested per Kuberay worker pod. +- **`requestMemory`**: Memory allocation for each Kuberay worker pod. Sufficient memory is required to load the model. +- **`requestGPU`**: Specifies the number of GPUs to allocate for each Kuberay worker pod. +- **`vllmConfig`**: Contains model-specific configurations: + - `tensorParallelSize`: Specifies the number of GPUs assigned to each worker pod. This value must be identical to both `requestGPU` and `raySpec.headNode.requestGPU`. + - `pipelineParallelSize`: Indicates the level of pipeline parallelism. This value must be equal to `replicaCount + 1`, representing the total number of Ray cluster nodes, including both head and worker nodes. + - **Important Note:** + - The total number of GPUs required is computed as `pipelineParallelSize × tensorParallelSize`. 
+ - This total must exactly match the sum of: + - `replicaCount × requestGPU` (the total number of GPUs allocated to Ray worker nodes), and + - `raySpec.headNode.requestGPU` (the number of GPUs allocated to the Ray head node). + - The `requestGPU` value for the Ray head node must be identical to that of each worker node. + - `tensorParallelSize` defines the number of GPUs allocated per Ray node (including both head and worker nodes), and must be consistent across all nodes. + - `pipelineParallelSize` represents the total number of Ray nodes, and must therefore be set to replicaCount + 1 (i.e., the number of worker nodes plus the head node). +- **`shmSize`**: Configures the shared memory size to ensure adequate memory is available for inter-process communication during tensor and pipeline parallelism execution. +- **`hf_token`**: The Hugging Face token for authenticating with the Hugging Face model hub. + +### Example Snippet + +In the following example, we configure a total of two Ray nodes each equipped with two GPUs (one head node and one worker node) to serve a distilgpt2 model. We set the tensor parallelism size to 2, as each node contains two GPUs, and the pipeline parallelism size to 2, corresponding to the two Ray nodes being utilized. + +```yaml +servingEngineSpec: + runtimeClassName: "" + raySpec: + headNode: + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + modelSpec: + - name: "distilgpt2" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "distilbert/distilgpt2" + + replicaCount: 1 + + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + + vllmConfig: + tensorParallelSize: 2 + pipelineParallelSize: 2 + + shmSize: "20Gi" + + hf_token: +``` + +## Step 3: Applying the Configuration + +Deploy the configuration using Helm: + +```bash +helm repo add vllm https://vllm-project.github.io/production-stack +helm install vllm vllm/vllm-stack -f tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml +``` + +Expected output: + +You should see output indicating the successful deployment of the Helm chart: + +```plaintext +NAME: vllm +LAST DEPLOYED: Sun May 11 15:10:34 2025 +NAMESPACE: default +STATUS: deployed +REVISION: 1 +TEST SUITE: None +``` + +## Step 4: Verifying the Deployment + +1. Check the status of the pods: + + ```bash + kubectl wait --for=condition=ready pod -l environment=router,release=router --namespace=default --timeout=60s && \ + kubectl get pods + ``` + + Expected output: + + You should see the following pods: + + ```plaintext + pod/vllm-deployment-router-8666bf6464-v97v8 condition met + NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES + kuberay-operator-f89ddb644-858bw 1/1 Running 0 12h 192.168.165.203 insudevmachine + vllm-deployment-router-8666bf6464-v97v8 1/1 Running 0 12h 192.168.165.206 insudevmachine + vllm-distilgpt2-raycluster-head-wvqj5 1/1 Running 0 12h 192.168.190.20 instance-20250503-060921 + vllm-distilgpt2-raycluster-ray-worker-fdvnh 1/1 Running 0 12h 192.168.165.207 insudevmachine + ``` + + - In this example, the production stack is deployed in a Kubernetes environment consisting of two nodes, each equipped with two GPUs. + + - The Ray head and worker nodes are scheduled on separate nodes. A total of four GPUs are utilized, with each node contributing two GPUs. + + - The vllm-deployment-router pod functions as the request router, directing incoming traffic to the appropriate model-serving pod. + + - The vllm-distilgpt2-raycluster-head pod is responsible for running the primary vLLM command. 
+ + - The vllm-distilgpt2-raycluster-ray-worker-* pods serve the model and handle inference requests. + +2. Verify the service is exposed correctly: + + ```bash + kubectl get services + ``` + + Expected output: + + Ensure there are services for both the serving engine and the router: + + ```plaintext + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + kuberay-operator ClusterIP 10.97.0.153 8080/TCP 13h + kubernetes ClusterIP 10.96.0.1 443/TCP 13h + vllm-distilgpt2-engine-service ClusterIP 10.106.237.111 80/TCP 12h + vllm-distilgpt2-raycluster-head-svc ClusterIP None 8000/TCP,8080/TCP 12h + vllm-router-service ClusterIP 10.97.229.184 80/TCP 12h + ``` + + - The `vllm-*-engine-service` exposes the head node of the ray cluster. + - The `vllm-*-router-service` handles routing and load balancing across model-serving pods. + +3. Test the health endpoint: + + To verify that the service is operational, execute the following commands: + + ```bash + kubectl port-forward svc/vllm-router-service 30080:80 + curl http://localhost:30080/v1/models + ``` + + **Note:** Port forwarding must be performed from a separate shell session. If the deployment is configured correctly, you should receive a response similar to the following: + + ```plaintext + { + "object": "list", + "data": [ + { + "id": "distilbert/distilgpt2", + "object": "model", + "created": 1747465656, + "owned_by": "vllm", + "root": null + } + ] + } + ``` + + You may also perform a basic inference test to validate that pipeline parallelism is functioning as expected. Use the following curl command: + + ```bash + curl -X POST http://localhost:30080/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "distilbert/distilgpt2", + "prompt": "Once upon a time,", + "max_tokens": 10 + }' + ``` + + A successful response should resemble the following output: + + ```plaintext + { + "id": "cmpl-92c4ceef0f1c42c9bba10da8306bf86c", + "object": "text_completion", + "created": 1747465724, + "model": "distilbert/distilgpt2", + "choices": [ + { + "index": 0, + "text": "? Huh, are you all red?\n\n", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "usage": { + "prompt_tokens": 5, + "total_tokens": 15, + "completion_tokens": 10, + "prompt_tokens_details": null + } + } + ``` + + You can also monitor GPU usage for each Ray head and worker pod: + + ```plaintext + kubectl exec -it vllm-distilgpt2-raycluster-head-wvqj5 -- /bin/bash + root@vllm-distilgpt2-raycluster-head-wvqj5:/vllm-workspace# nvidia-smi + Sat May 17 00:10:48 2025 + +-----------------------------------------------------------------------------------------+ + | NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | + |-----------------------------------------+------------------------+----------------------+ + | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | + | | | MIG M. 
| + |=========================================+========================+======================| + | 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 | + | N/A 76C P0 35W / 72W | 20313MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + | 1 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 | + | N/A 70C P0 33W / 72W | 20305MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + + +-----------------------------------------------------------------------------------------+ + | Processes: | + | GPU GI CI PID Type Process name GPU Memory | + | ID ID Usage | + |=========================================================================================| + | 0 N/A N/A 8 C /usr/bin/python3 0MiB | + | 1 N/A N/A 1082 C ray::RayWorkerWrapper 0MiB | + +-----------------------------------------------------------------------------------------+ + + ########################################################################################### + + kubectl exec -it vllm-distilgpt2-raycluster-ray-worker-fdvnh -- /bin/bash + Defaulted container "vllm-ray-worker" out of: vllm-ray-worker, wait-gcs-ready (init) + root@vllm-distilgpt2-raycluster-ray-worker-fdvnh:/vllm-workspace# nvidia-smi + Sat May 17 00:12:06 2025 + +-----------------------------------------------------------------------------------------+ + | NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | + |-----------------------------------------+------------------------+----------------------+ + | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | + | | | MIG M. | + |=========================================+========================+======================| + | 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 | + | N/A 76C P0 40W / 72W | 20065MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + | 1 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 | + | N/A 72C P0 38W / 72W | 20063MiB / 23034MiB | 0% Default | + | | | N/A | + +-----------------------------------------+------------------------+----------------------+ + + +-----------------------------------------------------------------------------------------+ + | Processes: | + | GPU GI CI PID Type Process name GPU Memory | + | ID ID Usage | + |=========================================================================================| + | 0 N/A N/A 243 C ray::RayWorkerWrapper 0MiB | + | 1 N/A N/A 244 C ray::RayWorkerWrapper 0MiB | + +-----------------------------------------------------------------------------------------+ + ``` + +## Conclusion + +In this tutorial, you configured and deployed the vLLM serving engine with support for pipeline parallelism across multiple GPUs within a multi-node Kubernetes environment using KubeRay. Additionally, you learned how to verify the deployment and monitor the associated pods to ensure proper operation. For further customization and configuration options, please consult the `values.yaml` file and the Helm chart documentation. 
diff --git a/tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml b/tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml new file mode 100644 index 000000000..62dbf8d54 --- /dev/null +++ b/tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml @@ -0,0 +1,24 @@ +servingEngineSpec: + runtimeClassName: "" + raySpec: + headNode: + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + modelSpec: + - name: "distilgpt2" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "distilbert/distilgpt2" + + replicaCount: 1 + + requestCPU: 2 + requestMemory: "20Gi" + requestGPU: 2 + + vllmConfig: + tensorParallelSize: 2 + pipelineParallelSize: 2 + + shmSize: "20Gi" diff --git a/utils/init-nvidia-gpu-setup-k8s.sh b/utils/init-nvidia-gpu-setup-k8s.sh new file mode 100755 index 000000000..c49d4144e --- /dev/null +++ b/utils/init-nvidia-gpu-setup-k8s.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -e + +# Allow users to override the paths for the NVIDIA tools. +: "${NVIDIA_SMI_PATH:=nvidia-smi}" +: "${NVIDIA_CTK_PATH:=nvidia-ctk}" + +# --- Debug and Environment Setup --- +echo "Current PATH: $PATH" +echo "Operating System: $(uname -a)" + +# Get the script directory to reference local scripts reliably. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# --- Install Prerequisites --- +echo "Installing kubectl and helm..." +bash "$SCRIPT_DIR/install-kubectl.sh" +bash "$SCRIPT_DIR/install-helm.sh" + +# --- Configure BPF (if available) --- +if [ -f /proc/sys/net/core/bpf_jit_harden ]; then + echo "Configuring BPF: Setting net.core.bpf_jit_harden=0" + if ! grep -q "net.core.bpf_jit_harden=0" /etc/sysctl.conf; then + echo "net.core.bpf_jit_harden=0" | sudo tee -a /etc/sysctl.conf + fi + sudo sysctl -p +else + echo "BPF JIT hardening configuration not available, skipping..." +fi + +# --- NVIDIA GPU Setup --- +GPU_AVAILABLE=false +if command -v "$NVIDIA_SMI_PATH" >/dev/null 2>&1; then + echo "NVIDIA GPU detected via nvidia-smi at: $(command -v "$NVIDIA_SMI_PATH")" + if command -v "$NVIDIA_CTK_PATH" >/dev/null 2>&1; then + echo "nvidia-ctk found at: $(command -v "$NVIDIA_CTK_PATH")" + GPU_AVAILABLE=true + else + echo "nvidia-ctk not found. Please install the NVIDIA Container Toolkit to enable GPU support." + fi +fi + +if [ "$GPU_AVAILABLE" = true ]; then + # Configure Docker for GPU support. + echo "Configuring Docker runtime for GPU support..." + if sudo "$NVIDIA_CTK_PATH" runtime configure --runtime=docker; then + echo "Restarting Docker to apply changes..." + echo "WARNING: Restarting Docker will stop and restart all containers." + sudo systemctl restart docker + echo "Docker runtime configured successfully." + else + echo "Error: Failed to configure Docker runtime using the NVIDIA Container Toolkit." + exit 1 + fi + + # Install the GPU Operator via Helm. + echo "Adding NVIDIA helm repo and updating..." + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update + echo "Installing GPU Operator..." + helm install --wait gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator --version=v24.9.1 +fi + +echo "NVIDIA GPU Setup complete." diff --git a/utils/install-calico.sh b/utils/install-calico.sh new file mode 100755 index 000000000..e8bf8608c --- /dev/null +++ b/utils/install-calico.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Refer to https://docs.tigera.io/calico/latest/getting-started/kubernetes/quickstart +# for more information. 
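+# Note: this script only issues kubectl commands, so kubectl must already be
+# configured against the target cluster (e.g., kubeadm init has completed and
+# ~/.kube/config is in place) before running it.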
+ +# Install the Tigera operator and custom resource definitions: +kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.30.0/manifests/tigera-operator.yaml + +# Install Calico by creating the necessary custom resources: +kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.30.0/manifests/custom-resources.yaml diff --git a/utils/install-cri-o.sh b/utils/install-cri-o.sh new file mode 100755 index 000000000..87c7a7c7f --- /dev/null +++ b/utils/install-cri-o.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Refer to https://github.com/cri-o/packaging/blob/main/README.md#distributions-using-deb-packages +# and +# https://github.com/cri-o/cri-o/blob/main/contrib/cni/README.md#configuration-directory +# for more information. + +# Install the dependencies for adding repositories +sudo apt-get update +sudo apt-get install -y software-properties-common curl + +export CRIO_VERSION=v1.32 + +# Add the CRI-O repository +curl -fsSL https://download.opensuse.org/repositories/isv:/cri-o:/stable:/$CRIO_VERSION/deb/Release.key | + sudo gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg + +echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://download.opensuse.org/repositories/isv:/cri-o:/stable:/$CRIO_VERSION/deb/ /" | + sudo tee /etc/apt/sources.list.d/cri-o.list + +# Install the packages +sudo apt-get update +sudo apt-get install -y cri-o + +# Update crio config by creating (or editing) /etc/crio/crio.conf +sudo tee /etc/crio/crio.conf > /dev/null <