Commit: Restructured and standardized READMEs

Showing 37 changed files with 1,532 additions and 509 deletions.

use-cases/inferencing/serving/vllm/autoscaling/README.md (160 additions, 0 deletions)

# Inferencing at scale

## Pre-requisites

- A model is deployed using one of the vLLM guides
  - [Serving the model using vLLM and GCSFuse](/use-cases/inferencing/serving/vllm/gcsfuse/README.md)
  - [Serving the model using vLLM and Persistent Disk](/use-cases/inferencing/serving/vllm/persistent-disk/README.md)
- Metrics are being scraped from the vLLM server as shown in the [vLLM Metrics](/use-cases/inferencing/serving/vllm/metrics/README.md) guide.

## Preparation

- Clone the repository

  ```sh
  git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \
  cd accelerated-platforms
  ```
- Change directory to the guide directory

  ```sh
  cd use-cases/inferencing/serving/vllm/autoscaling
  ```

- Ensure that your `MLP_ENVIRONMENT_FILE` is configured

  ```sh
  cat ${MLP_ENVIRONMENT_FILE} && \
  source ${MLP_ENVIRONMENT_FILE}
  ```

  > You should see the various variables populated with the information specific to your environment.

- Configure the environment

  | Variable        | Description                               | Example  |
  | --------------- | ----------------------------------------- | -------- |
  | SERVE_NAMESPACE | Namespace where the model will be served  | ml-serve |

  ```sh
  SERVE_NAMESPACE=ml-serve
  ```
## Scaling metrics

There are different metrics available that can be used to scale your inference workload on GKE:

- Server metrics: LLM inference servers such as vLLM provide workload-specific
  performance metrics. GKE simplifies scraping of those metrics and autoscaling
  the workloads based on these server-level metrics. You can use these metrics to
  gain visibility into performance indicators like batch size, queue size, and
  decode latencies.

  In the case of vLLM, the [production metrics class](https://docs.vllm.ai/en/latest/serving/metrics.html)
  exposes a number of useful metrics which GKE can use to horizontally scale
  inference workloads (a query sketch for checking these metrics in Cloud
  Monitoring follows this list):

  - `vllm:num_requests_running` - Number of requests currently running on GPU.
  - `vllm:num_requests_waiting` - Number of requests waiting to be processed.

  Here is an example of the metric `vllm:num_requests_running` in Metrics Explorer:

  ![metrics graph](images/cloud-monitoring-metrics-inference.png)

- GPU metrics: Metrics related to GPU utilization.

  - GPU Utilization (`DCGM_FI_DEV_GPU_UTIL`) - Measures the duty cycle, which is the
    amount of time that the GPU is active.

  - GPU Memory Usage (`DCGM_FI_DEV_FB_USED`) - Measures how much GPU memory is being
    used at a given point in time. This is useful for workloads that implement
    dynamic allocation of GPU memory.

- CPU metrics: Since inference workloads primarily rely on GPU resources, we
  don't recommend CPU and memory utilization as the only indicators of the
  amount of resources a job consumes. Therefore, using CPU metrics alone for
  autoscaling can lead to suboptimal performance and costs.
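
If you want to confirm that the vLLM metrics are actually arriving in Cloud
Monitoring before wiring them into an HPA, you can query Managed Service for
Prometheus directly. This is a minimal sketch, assuming `gcloud` is
authenticated against your project; the `MLP_PROJECT_ID` variable is an
assumption, substitute your own project ID if your environment file uses a
different name:

```sh
# Query the Managed Service for Prometheus HTTP API for the vLLM queue-depth metric.
# MLP_PROJECT_ID is an assumed variable name; replace it with your project ID if needed.
curl --silent --get \
  --header "Authorization: Bearer $(gcloud auth print-access-token)" \
  --data-urlencode "query=vllm:num_requests_waiting" \
  "https://monitoring.googleapis.com/v1/projects/${MLP_PROJECT_ID}/location/global/prometheus/api/v1/query"
```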

HPA is an efficient way to ensure that your model servers scale appropriately
with load. Fine-tuning the HPA settings is the primary way to align your
provisioned hardware cost with traffic demands to achieve your inference server
performance goals.

We recommend setting these HPA configuration options:

- Stabilization window: Use this HPA configuration option to prevent rapid
  replica count changes due to fluctuating metrics. Defaults are 5 minutes for
  scale-down (avoiding premature scale-down) and 0 for scale-up (ensuring responsiveness).
  Adjust the value based on your workload's volatility and your preferred responsiveness.

- Scaling policies: Use this HPA configuration option to fine-tune the scale-up
  and scale-down behavior. You can set the "Pods" policy limit to specify the
  absolute number of replicas changed per time unit, and the "Percent" policy
  limit to specify the percentage change per time unit.

For more details, see [Horizontal Pod autoscaling](https://cloud.google.com/kubernetes-engine/docs/horizontal-pod-autoscaling)
in the GKE documentation.
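
As an illustration of the two options above, a `behavior` stanza for an
`autoscaling/v2` HorizontalPodAutoscaler might look like the following. This is
a sketch only; the window lengths, pod counts, and percentages are illustrative
assumptions, not values recommended by this guide:

```yaml
# Illustrative HPA scaling behavior; tune the values for your own workload.
behavior:
  scaleUp:
    stabilizationWindowSeconds: 0     # react immediately to rising load
    policies:
    - type: Pods                      # add at most 2 replicas per 60s window
      value: 2
      periodSeconds: 60
    - type: Percent                   # or grow by at most 50% per 60s window
      value: 50
      periodSeconds: 60
    selectPolicy: Max                 # use whichever policy allows the larger change
  scaleDown:
    stabilizationWindowSeconds: 300   # wait 5 minutes before scaling down
    policies:
    - type: Pods                      # remove at most 1 replica per 60s window
      value: 1
      periodSeconds: 60
```

The stanza sits under `spec`, alongside the `metrics` block used by the
manifests in this guide.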

### Autoscale with HPA metrics

- Install the Custom Metrics Adapter. This adapter makes the custom metrics that you
  exported to Cloud Monitoring visible to the HPA. For more details, see the [Horizontal pod autoscaling (HPA)](https://cloud.google.com/stackdriver/docs/managed-prometheus/hpa)
  document in the [Google Cloud Managed Service for Prometheus (GMP)](https://cloud.google.com/stackdriver/docs/managed-prometheus) documentation.

  ```sh
  kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml
  ```

- Deploy a metric-based HPA resource based on your preferred custom metric.

  Choose one of the options below, `Queue-depth` or `Batch-size`, to configure
  the HPA resource in your manifest:

  - Queue-depth

    ```sh
    kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/hpa-vllm-openai-queue-size.yaml
    ```

  - Batch-size

    ```sh
    kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/hpa-vllm-openai-batch-size.yaml
    ```

  > NOTE: Adjust the target values for `vllm:num_requests_running`
  > or `vllm:num_requests_waiting` in the YAML file as appropriate.
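
After applying one of the manifests, you can optionally confirm that the
metrics adapter registered its APIs and that the HPA is able to read the vLLM
metric. This sketch uses only standard `kubectl` commands:

```sh
# Check that the custom metrics adapter registered its metrics APIs.
kubectl get apiservices | grep -E "custom.metrics.k8s.io|external.metrics.k8s.io"

# Inspect the HPA; the Metrics and Conditions sections show whether the
# vLLM metric can be read.
kubectl --namespace ${SERVE_NAMESPACE} describe hpa vllm-openai-hpa
```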

Once the HPA has been created on a given metric, GKE will autoscale the model
deployment pods when the metric goes over the specified threshold.
It will look something like the following:

```sh
kubectl --namespace ${SERVE_NAMESPACE} get hpa vllm-openai-hpa --watch
NAME              REFERENCE                TARGETS   MINPODS   MAXPODS   REPLICAS   AGE
vllm-openai-hpa   Deployment/vllm-openai   1/1       1         5         1          27s
vllm-openai-hpa   Deployment/vllm-openai   0/1       1         5         1          76s
vllm-openai-hpa   Deployment/vllm-openai   1/1       1         5         1          95s
```

You can also see the new pods coming online:

```sh
kubectl --namespace ${SERVE_NAMESPACE} get pods --watch
NAME                           READY   STATUS    RESTARTS   AGE
vllm-openai-767b477b77-2jm4v   1/1     Running   0          3d17h
vllm-openai-767b477b77-82l8v   0/1     Pending   0          9s
```

And eventually, the pods will be scaled up:

```sh
kubectl get pods -n ml-serve --watch
NAME                           READY   STATUS    RESTARTS   AGE
vllm-openai-767b477b77-2jm4v   1/1     Running   0          3d17h
vllm-openai-767b477b77-82l8v   1/1     Running   0          111s
```

If there are GPU resources available on the same node, the new pod may start on
it. Otherwise, a new node will be spun up by the autoscaler with the required
resources and the new pod will be started on it.
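
If the scale-up requires additional capacity, you can also watch new nodes join
the cluster:

```sh
# Watch for new nodes being provisioned to host the additional replicas.
kubectl get nodes --watch
```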

use-cases/inferencing/serving/vllm/autoscaling/manifests/hpa-vllm-openai-queue-size.yaml (33 additions, 0 deletions)

```yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm-openai-hpa
spec:
  maxReplicas: 5
  metrics:
  - pods:
      metric:
        name: prometheus.googleapis.com|vllm:num_requests_waiting|gauge
      target:
        averageValue: 10
        type: AverageValue
    type: Pods
  minReplicas: 1
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm-openai
```
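
The `hpa-vllm-openai-batch-size.yaml` manifest referenced in the README is not
part of this excerpt. Presumably it differs from the queue-size manifest only in
the metric it targets; the sketch below is an assumption modeled on the manifest
above, with an illustrative target value:

```yaml
# Hypothetical metrics entry for the batch-size variant (sits under spec:);
# not part of this diff.
metrics:
- pods:
    metric:
      name: prometheus.googleapis.com|vllm:num_requests_running|gauge
    target:
      averageValue: 10   # illustrative target for running requests per replica
      type: AverageValue
  type: Pods
```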