Add RayService vLLM TPU Inference script #1467

Open · wants to merge 18 commits into base: main
Changes from 9 commits
96 changes: 96 additions & 0 deletions ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml
@@ -0,0 +1,96 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4]
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: vllm-tpu
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-head
          image: $VLLM_IMAGE
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              cpu: "8"
              ephemeral-storage: 5Gi
              memory: 40G
            requests:
              cpu: "8"
              ephemeral-storage: 5Gi
              memory: 40G
          env:
          - name: HUGGING_FACE_HUB_TOKEN
            valueFrom:
              secretKeyRef:
                name: hf-secret
                key: hf_api_token
          - name: MODEL_ID
            value: "meta-llama/Meta-Llama-3-8B-Instruct"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          - containerPort: 8000
            name: serve
          - containerPort: 8471
            name: slicebuilder
          - containerPort: 8081
            name: mxla
  workerGroupSpecs:
  - groupName: tpu-group
    replicas: 1
    minReplicas: 0
    maxReplicas: 1
    numOfHosts: 2
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-worker
          image: $VLLM_IMAGE
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              cpu: "100"
              google.com/tpu: "4"
              ephemeral-storage: 50Gi
              memory: 100G
            requests:
              cpu: "100"
              google.com/tpu: "4"
              ephemeral-storage: 50Gi
              memory: 100G
          env:
          - name: JAX_PLATFORMS
            value: "tpu"
          - name: HUGGING_FACE_HUB_TOKEN
            valueFrom:
              secretKeyRef:
                name: hf-secret
                key: hf_api_token
          - name: MODEL_ID
            value: "meta-llama/Meta-Llama-3-8B-Instruct"
        nodeSelector:
          cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
          cloud.google.com/gke-tpu-topology: 2x2x2
# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4]
96 changes: 96 additions & 0 deletions ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml
@@ -0,0 +1,96 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e]
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: vllm-tpu
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        containers:
        - name: ray-head
          image: $VLLM_IMAGE
          imagePullPolicy: IfNotPresent
          resources:
            limits:
              cpu: "8"
              ephemeral-storage: 5Gi
              memory: 40G
            requests:
              cpu: "8"
              ephemeral-storage: 5Gi
              memory: 40G
          env:
          - name: HUGGING_FACE_HUB_TOKEN
            valueFrom:
              secretKeyRef:
                name: hf-secret
                key: hf_api_token
          - name: MODEL_ID
value: "meta-llama/Meta-Llama-3-8B-Instruct"
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
- containerPort: 8471
name: slicebuilder
- containerPort: 8081
name: mxla
workerGroupSpecs:
- groupName: tpu-group
replicas: 1
minReplicas: 0
maxReplicas: 1
numOfHosts: 2
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: $VLLM_IMAGE
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "100"
google.com/tpu: "4"
ephemeral-storage: 50Gi
memory: 100G
requests:
cpu: "100"
google.com/tpu: "4"
ephemeral-storage: 50Gi
memory: 100G
env:
- name: JAX_PLATFORMS
value: "tpu"
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-secret
key: hf_api_token
- name: MODEL_ID
value: "meta-llama/Meta-Llama-3-8B-Instruct"
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: 2x4
# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e]
100 changes: 100 additions & 0 deletions ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml
@@ -0,0 +1,100 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4]
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm-tpu
spec:
  serveConfigV2: |
    applications:
    - name: llm
      # Serve entrypoint: the `model` application object defined in serve_tpu.py.
      import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model
      deployments:
      - name: VLLMDeployment
        num_replicas: 1
      runtime_env:
        # Downloaded onto the cluster at deploy time; provides serve_tpu.py.
        working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
        env_vars:
          MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
          # Total TPU chips in the slice (2 hosts x 4 chips each).
          TPU_CHIPS: "8"
  rayClusterConfig:
    rayVersion: 2.34.0
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-head
            image: $VLLM_IMAGE
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 6379
              name: gcs
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
              name: client
            - containerPort: 8000
              name: serve
            env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
            resources:
              limits:
                cpu: "8"
                memory: 40G
              requests:
                cpu: "8"
                memory: 40G
    workerGroupSpecs:
    - groupName: tpu-group
      replicas: 1
      minReplicas: 0
      maxReplicas: 2
      # Each worker replica spans 2 TPU VM hosts (a full 2x2x2 v4 slice).
      numOfHosts: 2
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-worker
            image: $VLLM_IMAGE
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "100"
                google.com/tpu: "4"
                ephemeral-storage: 50Gi
                memory: 100G
              requests:
                cpu: "100"
                google.com/tpu: "4"
                ephemeral-storage: 50Gi
                memory: 100G
            env:
            - name: JAX_PLATFORMS
              value: "tpu"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
            cloud.google.com/gke-tpu-topology: 2x2x2
# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4]
100 changes: 100 additions & 0 deletions ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml
@@ -0,0 +1,100 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e]
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm-tpu
spec:
  serveConfigV2: |
    applications:
    - name: llm
      import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model
      deployments:
      - name: VLLMDeployment
        num_replicas: 1
      runtime_env:
        working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
        env_vars:
          MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
          TPU_CHIPS: "8"
  rayClusterConfig:
    rayVersion: 2.34.0
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-head
            image: $VLLM_IMAGE
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 6379
              name: gcs
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
              name: client
            - containerPort: 8000
              name: serve
            env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
            resources:
              limits:
                cpu: "8"
                memory: 40G
              requests:
                cpu: "8"
                memory: 40G
    workerGroupSpecs:
    - groupName: tpu-group
      replicas: 1
      minReplicas: 0
      maxReplicas: 2
      numOfHosts: 2
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-worker
            image: $VLLM_IMAGE
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "100"
                google.com/tpu: "4"
                ephemeral-storage: 50Gi
                memory: 100G
              requests:
                cpu: "100"
                google.com/tpu: "4"
                ephemeral-storage: 50Gi
                memory: 100G
            env:
            - name: JAX_PLATFORMS
              value: "tpu"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
            cloud.google.com/gke-tpu-topology: 2x4
# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e]
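
The serveConfigV2 blocks above point the `llm` application at `ai-ml.gke-ray.rayserve.llm.serve_tpu:model`, but the serve_tpu.py entrypoint itself is not part of this diff view. For orientation, here is a minimal sketch of what a Ray Serve + vLLM entrypoint of that shape could look like. Only the deployment name `VLLMDeployment`, the bound `model` object, and the `MODEL_ID`/`TPU_CHIPS` environment variables come from the manifests above; the `/generate` route and the use of the synchronous `vllm.LLM` engine are illustrative assumptions, not the code added by this PR.

# serve_tpu.py -- illustrative sketch only, not the script added by this PR.
import os

from fastapi import FastAPI
from ray import serve
from vllm import LLM, SamplingParams

app = FastAPI()


@serve.deployment(name="VLLMDeployment")
@serve.ingress(app)
class VLLMDeployment:
    def __init__(self):
        # Shard the model across all TPU chips in the slice; TPU_CHIPS is set
        # to "8" in serveConfigV2 (2 hosts x 4 chips for both topologies above).
        self.llm = LLM(
            model=os.environ["MODEL_ID"],
            tensor_parallel_size=int(os.environ.get("TPU_CHIPS", "8")),
        )

    @app.post("/generate")
    def generate(self, prompt: str, max_tokens: int = 128) -> str:
        # Run a single blocking generation request and return the completion text.
        outputs = self.llm.generate([prompt], SamplingParams(max_tokens=max_tokens))
        return outputs[0].outputs[0].text


# `import_path: ...serve_tpu:model` resolves to this bound application.
model = VLLMDeployment.bind()

Once the RayService is applied, requests would reach the application through the head Pod's serve port (8000) exposed in the manifests above.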