Restructured and standardized READMEs

Showing 34 changed files with 1,327 additions and 493 deletions.

# Distributed Inferencing on vLLM

There are three common strategies for inference on vLLM:

- Single GPU (no distributed inference)
- Single-Node Multi-GPU (tensor parallel inference)
- Multi-Node Multi-GPU

In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphics processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework, using the deployment strategies mentioned above. You can choose to swap the Gemma model with any other fine-tuned or instruction-tuned model for inference on GKE.

- Single GPU (no distributed inference) - If your model fits on a single GPU, you probably don't need distributed inference. Just use the single GPU to run the inference.
- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit on a single GPU but can fit on a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use; for example, if you need 4 GPUs, set the tensor parallel size to 4 (see the sketch after this list).
- Multi-Node Multi-GPU - If your model is too large to fit on a single node, you can combine tensor parallelism with pipeline parallelism across multiple nodes.
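
To make the tensor parallel option concrete, here is a minimal sketch of running the vLLM OpenAI-compatible server across 4 GPUs on one machine, outside of Kubernetes. The container image is the same one used later in this guide; the model path is a placeholder for a directory that already contains the model weights.

```sh
# Illustrative only: serve a local model across 4 GPUs with tensor parallelism.
# /path/to/model is a placeholder, not a path used elsewhere in this guide.
docker run --gpus all --shm-size=10g -p 8000:8000 \
    -v /path/to/model:/model \
    vllm/vllm-openai:v0.6.3.post1 \
    --model /model \
    --tensor-parallel-size 4
```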

By the end of this guide, you should be able to perform the following steps:

- Deploy a vLLM container to your cluster to host your model
- Use vLLM to serve the fine-tuned Gemma model
- View production metrics for your model serving
- Use custom metrics and the Horizontal Pod Autoscaler (HPA) to scale your model

## Prerequisites

- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment, the scripts and manifests will need to be modified for that environment.
- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md)

## Preparation

- Clone the repository

```sh
git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \
cd accelerated-platforms
```

- Change directory to the guide directory

```sh
cd use-cases/inferencing/serving/vllm/gcsfuse
```

- Ensure that your `MLP_ENVIRONMENT_FILE` is configured

```sh
cat ${MLP_ENVIRONMENT_FILE} && \
source ${MLP_ENVIRONMENT_FILE}
```

> You should see the various variables populated with the information specific to your environment.

- Configure the environment

| Variable        | Description                              | Example      |
| --------------- | ---------------------------------------- | ------------ |
| SERVE_KSA       | The Kubernetes service account           | ml-serve-gcs |
| SERVE_NAMESPACE | Namespace where the model will be served | ml-serve     |

```sh
SERVE_KSA=ml-serve-gcs
SERVE_NAMESPACE=ml-serve
```

- Get credentials for the GKE cluster

```sh
gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID}
```

- Create the namespace and Kubernetes service account, and grant the service account read access to the model bucket

```sh
kubectl create ns ${SERVE_NAMESPACE}
kubectl create sa ${SERVE_KSA} -n ${SERVE_NAMESPACE}
gcloud storage buckets add-iam-policy-binding "gs://${MLP_MODEL_BUCKET}" \
    --member "principal://iam.googleapis.com/projects/${MLP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${MLP_PROJECT_ID}.svc.id.goog/subject/ns/${SERVE_NAMESPACE}/sa/${SERVE_KSA}" \
    --role "roles/storage.objectViewer"
```
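
Optionally, confirm that the Workload Identity principal was granted access to the bucket. This is just a quick check using the same `gcloud storage` command group as above.

```sh
# List the bucket's IAM bindings and look for the service account principal
gcloud storage buckets get-iam-policy "gs://${MLP_MODEL_BUCKET}" \
    --format=json | grep "${SERVE_KSA}"
```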

## Prepare the Persistent Disk (PD)

Loading model weights from a PersistentVolume is a way to load models faster. In GKE, PersistentVolumes backed by Google Cloud Persistent Disks can be mounted read-only simultaneously by multiple nodes (ReadOnlyMany), which allows multiple pods to access the model weights from a single volume.

- Configure the environment

| Variable      | Description                                                                                    | Example       |
| ------------- | ---------------------------------------------------------------------------------------------- | ------------- |
| ACCELERATOR   | Type of GPU accelerator to use (l4, a100, h100)                                                | l4            |
| MODEL_NAME    | The name of the model folder in the root of the GCS model bucket                               | model-gemma2  |
| MODEL_VERSION | The name of the version folder inside the model folder of the GCS model bucket                 | experiment    |
| ZONE          | GCP zone where you have accelerators available. The zone must be in the region ${MLP_REGION}.  | us-central1-a |

```sh
ACCELERATOR=l4
MODEL_NAME=model-gemma2
MODEL_VERSION=experiment
ZONE=us-central1-a
```
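
The manifests in this guide mount the model bucket directly with the GCS FUSE CSI driver, but if you want to experiment with the PersistentVolume approach described above, a minimal sketch might look like the following. The disk name, capacity, and the assumption that a zonal persistent disk already exists in `${ZONE}` and contains the model weights are all illustrative, not part of this guide.

```sh
# Illustrative only: a ReadOnlyMany PersistentVolume/PersistentVolumeClaim backed by a
# pre-created Compute Engine persistent disk that already contains the model weights.
# The disk name and sizes are assumptions.
kubectl --namespace ${SERVE_NAMESPACE} apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: vllm-model-weights-pv
spec:
  accessModes:
    - ReadOnlyMany
  capacity:
    storage: 100Gi
  csi:
    driver: pd.csi.storage.gke.io
    volumeHandle: projects/${MLP_PROJECT_ID}/zones/${ZONE}/disks/vllm-model-weights-disk
    fsType: ext4
    readOnly: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-model-weights-pvc
spec:
  accessModes:
    - ReadOnlyMany
  resources:
    requests:
      storage: 100Gi
  storageClassName: ""
  volumeName: vllm-model-weights-pv
EOF
```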

## Serve the model with vLLM

- Configure the deployment

```sh
VLLM_IMAGE_NAME="vllm/vllm-openai:v0.6.3.post1"
```

```sh
sed \
    -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \
    -i -e "s|V_MODEL_NAME|${MODEL_NAME}|" \
    -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \
    -i -e "s|V_IMAGE_NAME|${VLLM_IMAGE_NAME}|" \
    -i -e "s|V_KSA|${SERVE_KSA}|" \
    manifests/model-deployment-${ACCELERATOR}.yaml
```
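
Optionally, spot-check that the placeholders were replaced before applying the manifest.

```sh
# These fields should now show your image, bucket, and Kubernetes service account
grep -E "image:|bucketName:|serviceAccountName:" manifests/model-deployment-${ACCELERATOR}.yaml
```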

- Create the deployment

```sh
kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/model-deployment-${ACCELERATOR}.yaml
```

- Wait for the deployment to be ready

```sh
kubectl --namespace ${SERVE_NAMESPACE} wait --for=condition=ready --timeout=900s pod --selector app=vllm-openai-gcs-${ACCELERATOR}
```
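
Before moving on, you can send a quick smoke test request to the OpenAI-compatible API. This sketch assumes the Service created by the manifest is named `vllm-openai-gcs-${ACCELERATOR}` and listens on port 8000, as in the manifests in this guide; vLLM uses the `--model` path as the served model name, so the request references `/gcs/${MODEL_NAME}/${MODEL_VERSION}`.

```sh
# Forward the vLLM Service to your machine (run in a separate terminal or background it)
kubectl --namespace ${SERVE_NAMESPACE} port-forward service/vllm-openai-gcs-${ACCELERATOR} 8000:8000 &

# Send a test completion request to the OpenAI-compatible endpoint
curl http://127.0.0.1:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "/gcs/'"${MODEL_NAME}"'/'"${MODEL_VERSION}"'",
      "prompt": "I am looking for comfortable cycling shorts for women, what are some good options?",
      "max_tokens": 256
    }'
```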

## Serve the model through a web chat interface

- Configure the deployment

```sh
sed \
    -i -e "s|V_ACCELERATOR|${ACCELERATOR}|g" \
    -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \
    -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|g" \
    manifests/gradio.yaml
```

- Create the deployment

```sh
kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/gradio.yaml
```

- Verify the deployment is ready
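
The `gradio.yaml` manifest is not shown in this guide, so the exact label is an assumption; if the Gradio pods are labeled `app=gradio`, a readiness check similar to the one used for the model server would be:

```sh
# Assumes the Gradio pods carry the label app=gradio
kubectl --namespace ${SERVE_NAMESPACE} wait --for=condition=ready --timeout=300s pod --selector app=gradio
```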

- Access the chat interface

```sh
echo -e "\nGradio chat interface: ${MLP_GRADIO_NAMESPACE_ENDPOINT}\n"
```

- Enter the following prompt in the chat text box to get a response from the model.

```
I'm looking for comfortable cycling shorts for women, what are some good options?
```

## Metrics

vLLM exposes a number of metrics that can be used to monitor the health of the system. For more information about accessing these metrics, see [vLLM Metrics](/use-cases/inferencing/serving/vllm/metrics/README.md).
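
For a quick look at the raw Prometheus metrics without setting up a scraping pipeline, you can port-forward the serving Service and query the `/metrics` endpoint that the vLLM OpenAI server exposes on the same port. A different local port is used here to avoid clashing with an earlier port-forward.

```sh
# Forward the vLLM Service on a spare local port and list a few vllm-prefixed metrics
kubectl --namespace ${SERVE_NAMESPACE} port-forward service/vllm-openai-gcs-${ACCELERATOR} 8001:8000 &
curl -s http://127.0.0.1:8001/metrics | grep "^vllm:" | head -20
```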

### Run batch inference on GKE

Once a model has completed fine-tuning and is deployed on GKE, you can run batch inference on it. Follow the instructions in the [batch inference README](/use-cases/inferencing/batch-inference/README.md) to run batch inference.

### Run benchmarks for inference

The model is now ready for benchmarking. Follow the [benchmark README](/use-cases/inferencing/benchmarks/README.md) to run inference benchmarks on your model.

### Inference at scale

You can configure the Horizontal Pod Autoscaler (HPA) to scale your inference deployment based on relevant metrics. Follow the instructions in the [inference at scale README](./inference-scale/README.md) to scale your deployed model.
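
As a rough illustration of what that can look like, here is a hedged sketch of an `autoscaling/v2` HorizontalPodAutoscaler driven by a vLLM metric. It assumes a custom metrics adapter (for example, the Custom Metrics Stackdriver Adapter) is already installed and exposes vLLM's `vllm:num_requests_running` gauge to the Kubernetes custom metrics API under that name; the exact metric name and target value depend on your adapter and workload, and the linked guide is the authoritative walkthrough.

```sh
# Illustrative only: scale the vLLM deployment on in-flight request count.
# The metric name below is an assumption; it must match what your custom
# metrics adapter exposes to the Kubernetes custom metrics API.
kubectl --namespace ${SERVE_NAMESPACE} apply -f - <<EOF
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm-openai-gcs-${ACCELERATOR}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm-openai-gcs-${ACCELERATOR}
  minReplicas: 1
  maxReplicas: 3
  metrics:
  - type: Pods
    pods:
      metric:
        name: vllm:num_requests_running
      target:
        type: AverageValue
        averageValue: "5"
EOF
```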

use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml (107 additions, 0 deletions)

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-openai-gcs-a100
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-openai-gcs-a100
  template:
    metadata:
      labels:
        app: vllm-openai-gcs-a100
      annotations:
        # Inject the GCS FUSE CSI sidecar into this Pod
        gke-gcsfuse/volumes: "true"
    spec:
      containers:
        - name: inference-server
          args:
            - --model=$(MODEL)
            - --tensor-parallel-size=2
          env:
            # Model weights are read from the GCS bucket mounted at /gcs
            - name: MODEL
              value: /gcs/V_MODEL_NAME/V_MODEL_VERSION
            - name: VLLM_ATTENTION_BACKEND
              value: FLASHINFER
          image: V_IMAGE_NAME
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: 8000
              scheme: HTTP
            initialDelaySeconds: 240
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            requests:
              cpu: "2"
              memory: "25Gi"
              ephemeral-storage: "25Gi"
              nvidia.com/gpu: "2"
            limits:
              cpu: "2"
              memory: "25Gi"
              ephemeral-storage: "25Gi"
              nvidia.com/gpu: "2"
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - name: gcs-fuse-csi-ephemeral
              mountPath: /gcs
              readOnly: true
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-a100
      serviceAccountName: V_KSA
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
        - key: "on-demand"
          value: "true"
          operator: "Equal"
          effect: "NoSchedule"
      volumes:
        # Shared memory used by tensor parallel inter-process communication
        - name: dshm
          emptyDir:
            medium: Memory
        # Ephemeral GCS FUSE volume backed by the model bucket, mounted read-only
        # with file caching and parallel downloads enabled
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: V_MODEL_BUCKET
              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1"
              fileCacheCapacity: "20Gi"
              fileCacheForRangeRead: "true"
              metadataStatCacheCapacity: "-1"
              metadataTypeCacheCapacity: "-1"
              metadataCacheTTLSeconds: "-1"
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-openai-gcs-a100
spec:
  selector:
    app: vllm-openai-gcs-a100
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000