diff --git a/.gitignore b/.gitignore index c2305b35..d1dcf0df 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ __pycache__/ .venv/ venv/ +# Repositories +monitoring-dashboard-samples/ + # Terraform *.terraform/ *.terraform-*/ diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/README.md b/use-cases/inferencing/batch-inference/README.md similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/README.md rename to use-cases/inferencing/batch-inference/README.md diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/example_predictions.txt b/use-cases/inferencing/batch-inference/example_predictions.txt similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/example_predictions.txt rename to use-cases/inferencing/batch-inference/example_predictions.txt diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/manifests/batch_inference.yaml b/use-cases/inferencing/batch-inference/manifests/batch_inference.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/manifests/batch_inference.yaml rename to use-cases/inferencing/batch-inference/manifests/batch_inference.yaml diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/Dockerfile b/use-cases/inferencing/batch-inference/src/Dockerfile similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/Dockerfile rename to use-cases/inferencing/batch-inference/src/Dockerfile diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/cloudbuild.yaml b/use-cases/inferencing/batch-inference/src/cloudbuild.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/cloudbuild.yaml rename to use-cases/inferencing/batch-inference/src/cloudbuild.yaml diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/custom_json_formatter.py b/use-cases/inferencing/batch-inference/src/custom_json_formatter.py similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/custom_json_formatter.py rename to use-cases/inferencing/batch-inference/src/custom_json_formatter.py diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/logging.conf b/use-cases/inferencing/batch-inference/src/logging.conf similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/logging.conf rename to use-cases/inferencing/batch-inference/src/logging.conf diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/requirements.txt b/use-cases/inferencing/batch-inference/src/requirements.txt similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/requirements.txt rename to use-cases/inferencing/batch-inference/src/requirements.txt diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/run_batch_predictions.py b/use-cases/inferencing/batch-inference/src/run_batch_predictions.py similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/run_batch_predictions.py rename to use-cases/inferencing/batch-inference/src/run_batch_predictions.py diff --git a/use-cases/inferencing/serving-with-vllm/benchmarks/README.md b/use-cases/inferencing/benchmarks/README.md similarity index 100% rename from use-cases/inferencing/serving-with-vllm/benchmarks/README.md rename to use-cases/inferencing/benchmarks/README.md diff --git 
a/use-cases/inferencing/serving-with-vllm/benchmarks/locust.jpg b/use-cases/inferencing/benchmarks/locust.jpg similarity index 100% rename from use-cases/inferencing/serving-with-vllm/benchmarks/locust.jpg rename to use-cases/inferencing/benchmarks/locust.jpg diff --git a/use-cases/inferencing/serving-with-vllm/benchmarks/locustfile.py b/use-cases/inferencing/benchmarks/locustfile.py similarity index 100% rename from use-cases/inferencing/serving-with-vllm/benchmarks/locustfile.py rename to use-cases/inferencing/benchmarks/locustfile.py diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu_metrics.yaml b/use-cases/inferencing/inference-scale/gpu_metrics.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu_metrics.yaml rename to use-cases/inferencing/inference-scale/gpu_metrics.yaml diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/hpa_vllm_openai_batch_size.yaml b/use-cases/inferencing/inference-scale/hpa_vllm_openai_batch_size.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/hpa_vllm_openai_batch_size.yaml rename to use-cases/inferencing/inference-scale/hpa_vllm_openai_batch_size.yaml diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/cloud-monitoring-metrics-inference.png b/use-cases/inferencing/inference-scale/images/cloud-monitoring-metrics-inference.png similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/cloud-monitoring-metrics-inference.png rename to use-cases/inferencing/inference-scale/images/cloud-monitoring-metrics-inference.png diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu-metrics.png b/use-cases/inferencing/inference-scale/images/gpu-metrics.png similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu-metrics.png rename to use-cases/inferencing/inference-scale/images/gpu-metrics.png diff --git a/use-cases/inferencing/serving-with-vllm/README.md b/use-cases/inferencing/serving-with-vllm/README.md deleted file mode 100644 index 50d548e6..00000000 --- a/use-cases/inferencing/serving-with-vllm/README.md +++ /dev/null @@ -1,418 +0,0 @@ -# Distributed Inferencing on vLLM - -There are three common strategies for inference on vLLM: - -- Single GPU (no distributed inference) -- Single-Node Multi-GPU (tensor parallel inference) -- Multi-Node Multi-GPU - -In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphical processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework with the above mentioned deployment strategies. You can choose to swap the Gemma model with any other fine-tuned or instruction based model for inference on GKE. - -- Single GPU (no distributed inference) - If your model fits in a single GPU, you probably don’t need to use distributed inference. Just use the single GPU to run the inference. - -- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you need 4 GPUs, you can set the tensor parallel size to 4. 
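To make this concrete, tensor parallelism maps to a single flag on the vLLM OpenAI-compatible server. The following is a minimal sketch; the model path and GPU count are illustrative assumptions, not values from this guide:

```sh
# Minimal sketch: launch the vLLM OpenAI-compatible server with tensor parallelism.
# The model path below is illustrative; set --tensor-parallel-size to the number
# of GPUs available on the node.
python -m vllm.entrypoints.openai.api_server \
  --model /data/models/model-gemma2/experiment \
  --tensor-parallel-size 2 \
  --port 8000
```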
- -By the end of this guide, you should be able to perform the following steps: - -- Create a Persistent Disk for the LLM model weights -- Deploy a vLLM container to your cluster to host your model -- Use vLLM to serve the fine-tuned Gemma model -- View Production metrics for your model serving -- Use custom metrics and Horizontal Pod Autoscaler (HPA) to scale your model - -## Prerequisites - -- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment the scripts and manifest will need to be modified for that environment. -- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md) - -## Preparation - -- Clone the repository and change directory to the guide directory - - ```sh - git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ - cd accelerated-platforms/use-cases/inferencing/serving-with-vllm - ``` - -- Ensure that your `MLP_ENVIRONMENT_FILE` is configured - - ```sh - cat ${MLP_ENVIRONMENT_FILE} && \ - source ${MLP_ENVIRONMENT_FILE} - ``` - - > You should see the various variables populated with the information specific to your environment. - -- Set environment variables - - ```sh - MLP_PROJECT_ID= - PROJECT_NUMBER=$(gcloud projects describe ${PROJECT_ID} --format="value(projectNumber)") - V_MODEL_BUCKET= - MLP_CLUSTER_NAME= - SERVE_NAMESPACE=ml-serve # SERVE_NAMESPACE functions to serving the model - OPS_NAMESPACE=ml-ops # OPS_NAMESPACE functions to download/upload model artifacts and dataset to and from GCS and artifact registry - MODEL_ID= # example : model-gemma2-a100 - MODEL_DIR_PATH ="" # location to copy the model artifacts from - REGION= - ZONE= - ACCELERATOR_TYPE= # nvidia-l4 | nvidia-tesla-a100 - ``` - -- Get Credentials for the GKE cluster - - ```sh - gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID} - ``` - -- Grant permission to kubernetes service account in cluster to access the storage bucket to view model weights - - ```sh - kubectl create ns ${SERVE_NAMESPACE} - kubectl create ns ${OPS_NAMESPACE} - kubectl create sa $KSA -n ${OPS_NAMESPACE} # KSA to download model artifacts from GCS - gcloud storage buckets add-iam-policy-binding "gs://$V_MODEL_BUCKET" \ - --member "principal://iam.googleapis.com/projects/"${PROJECT_NUMBER}"/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/$OPS_NAMESPACE/sa/$KSA" \ - --role "roles/storage.objectViewer" - ``` - -- Update the bucket access level to uniform. - - ```sh - gcloud storage buckets update "gs://$V_MODEL_BUCKET" --uniform-bucket-level-access - ``` - -## Create PV, PVC and Persistent disk - -Loading model weights from a Persistent Volume is a method to load models faster. In GKE, Persistent Volumes backed by GCP Persistent Disks can be mounted read-only simultaneously by multiple nodes(ReadOnlyMany), this allows multiple pods access the model weights from a single volume. - - Create a PVC for the model weights - - ```sh - kubectl apply -f manifests/volume-prep/pvc_disk_image.yaml -n ${OPS_NAMESPACE} - ``` - - Create a job downloading the models to the volume and review logs for successful completion. 
- - ```sh - sed -i -e "s|V_KSA|${KSA}|" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - sed -i -e "s|V_MODEL_BUCKET|${MODEL_BUCKET}|" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - sed -i -e "s|V_MODEL_ID|${MODEL_ID}|g" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - sed -i -e "s|V_MODEL_DIR_PATH|${MODEL_DIR_PATH}|" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - kubectl create -f manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml -n ${OPS_NAMESPACE} - ``` - - Wait for the job to show completion status. - - ```sh - kubectl get jobs -n ${OPS_NAMESPACE} --watch - ``` - - ``` - NAME STATUS COMPLETIONS DURATION AGE - module-download-job-vl7cc Running 0/1 4m1s 4m1s - module-download-job-vl7cc Running 0/1 4m31s 4m31s - module-download-job-vl7cc Running 0/1 4m35s 4m35s - module-download-job-vl7cc Complete 1/1 4m35s 4m35s - ``` - -You can also check pod logs to check the progress of disk creation. - - ```sh - kubectl logs module-download-job-vl7cc-km29x -n ${OPS_NAMESPACE} - ``` - - ``` - Creating filesystem with 26214400 4k blocks and 6553600 inodes - Filesystem UUID: 8eec47d3-920a-423c-919a-959f016d50cb - Superblock backups stored on blocks: - 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, - 4096000, 7962624, 11239424, 20480000, 23887872 - - Allocating group tables: done - Writing inode tables: done - Creating journal (131072 blocks): done - Writing superblocks and filesystem accounting information: done - - /mnt/model-gemma2-a100: - total 4 - drwxr-xr-x 3 root root 4096 Oct 25 22:00 experiment-a2aa2c3it1 - ``` - - Create the PV and PVC - - ```sh - PV_NAME="$(kubectl get pvc/block-pvc-model -n ${OPS_NAMESPACE} -o jsonpath='{.spec.volumeName}')" - DISK_REF="$(kubectl get pv "$PV_NAME" -n ${OPS_NAMESPACE} -o jsonpath='{.spec.csi.volumeHandle}')" - ``` - - ```sh - gcloud compute images create model-weights-image --source-disk="$DISK_REF" - ``` - - ```sh - gcloud compute disks create models-fine-tune-disk-v1 --size=1TiB --type=pd-ssd --zone=${ZONE} --image=model-weights-image - ``` - - > Note: Choose the appropriate zone based on cluster node location and GPU availability - - ```sh - VOLUME_HANDLE="projects/${MLP_PROJECT_ID}/zones/${ZONE}/disks/models-fine-tune-disk-v1" - sed -i -e "s|V_VOLUME_HANDLE|${VOLUME_HANDLE}|" manifests/volume-prep/persistent_volume.yaml - sed -i -e "s|V_ZONE|${ZONE}|" manifests/volume-prep/persistent_volume.yaml - kubectl apply -f manifests/volume-prep/persistent_volume.yaml # PVs are namespace-less - kubectl apply -f manifests/volume-prep/persistent_volume_claim.yaml -n ${SERVE_NAMESPACE} # Deploy PVC claim where you serve the model - ``` - -## Deploy a vLLM container to your cluster - -- Run the batch job to deploy model using persistent disk on GKE. 
- There are few variables that need to be updated before you can run this job to download model on your persistent disk - - - Here is an example - - ```sh - mkdir -p /mnt/model-gemma2-a100 - cp -r /data/models/model-gemma2-a100/experiment-a2aa2c3it1 /mnt/model-gemma2-a100 # location to copy the model artifacts from - ls -lR /mnt/model-gemma2-a100 # location to copy the model artifacts - ``` - - ```sh - sed -i -e "s|V_MODEL_ID|${MODEL_ID}|" manifests/model_deployment.yaml - sed -i -e "s|V_MODEL_DIR_PATH|${MODEL_DIR_PATH}|" manifests/model_deployment.yaml - sed -i -e "s|V_ACCELERATOR_TYPE|${ACCELERATOR_TYPE}|" manifests/model_deployment.yaml - ``` - - ```sh - kubectl create -f manifests/model_deployment.yaml -n ${SERVE_NAMESPACE} - kubectl logs -f -l app=vllm-openai -n ${SERVE_NAMESPACE} - ``` - - ```sh - INFO: Started server process [1] - INFO: Waiting for application startup. - INFO: Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) - ``` - -## Serve the deployed model through curl and a web chat interface - -- Test your deployed model through the CLI - - ```sh - kubectl port-forward svc/vllm-openai -n ${SERVE_NAMESPACE} 8000 - ``` - - Run the curl prompt with your values - - ```sh - USER_PROMPT="I'm looking for comfortable cycling shorts for women, what are some good options?" - MODEL_ID="" - ``` - - ``` - curl http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "${MODEL_ID}", - "messages": [ - {"role": "user", "content": "${USER_PROMPT}"}], - "temperature": 0.70, - "top_p": 1.0, - "top_k": 1.0, - "max_tokens": 256 - }' - ``` - -- You can also deploy a gradio chat interface to view the model chat interface. [OPTIONAL] - - ```sh - sed -i -e "s|V_MODEL_ID|${MODEL_ID}|" manifests/gradio.yaml - ``` - - ```sh - kubectl apply -f manifests/gradio.yaml -n ${SERVE_NAMESPACE} - ``` - -### Production Metrics - -vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. - - ```sh - curl http://vllm-openai:8000/metrics - ``` - -### View vLLM serving metrics for your model on GKE - -You can configure monitoring of the metrics above using the [pod monitoring](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed#gmp-pod-monitoring) - - ```sh - kubectl apply -f manifests/pod_monitoring.yaml -n ${SERVE_NAMESPACE} - ``` - -### Create a dashboard for Cloud Monitoring to view vLLM metrics - -Cloud Monitoring provides an [importer](https://cloud.google.com/monitoring/dashboards/import-grafana-dashboards) that you can use to import dashboard files in the Grafana JSON format into Cloud Monitoring - - Clone github repository - - ```sh - git clone https://github.com/GoogleCloudPlatform/monitoring-dashboard-samples - ``` - - Change to the directory for the dashboard importer: - - ```sh - cd monitoring-dashboard-samples/scripts/dashboard-importer - ``` - -The dashboard importer includes the following scripts: - -- import.sh, which converts dashboards and optionally uploads the converted dashboards to Cloud Monitoring. -- upload.sh, which uploads the converted dashboards—or any Monitoring dashboards—to Cloud Monitoring. The import.sh script calls this script to do the upload. - - Import the dashboard - - ```sh - ./import.sh ./configs/grafana.json ${MLP_PROJECT_ID} - ``` - - When you use the import.sh script, you must specify the location of the Grafana dashboards to convert. 
The importer creates a directory that contains the converted dashboards and other information. - - -### Run Batch inference on GKE - -Once a model has completed fine-tuning and is deployed on GKE , its ready to run batch Inference pipeline. -In this example batch inference pipeline, we would first send prompts to the hosted fine-tuned model and then validate the results based on ground truth. - -Please follow ```use-cases/inferencing/serving-with-vllm/batch-inference/README.md``` for instructions. - -### Run benchmarks for inference - -Please follow ```use-cases/inferencing/serving-with-vllm/benchmarks/README.md``` for instructions. - -### Inference at Scale - -There are different metrics available that could be used to scale your inference workloads on GKE. - -Server metrics: LLM inference servers vLLM provides workload-specific performance metrics. GKE simplifies scraping and autoscaling of workloads based on these server-level metrics. You can use these metrics to gain visibility into performance indicators like batch size, queue size, and decode latencies - -In case of vLLM, [production metrics class](https://docs.vllm.ai/en/latest/serving/metrics.html) exposes a number of useful metrics which GKE can use to horizontally scale inference workloads. - -```sh -vllm:num_requests_running - Number of requests currently running on GPU. -vllm:num_requests_waiting - Number of requests waiting to be processed -``` - -GPU metrics: - -```none -GPU Utilization (DCGM_FI_DEV_GPU_UTIL) - Measures the duty cycle, which is the amount of time that the GPU is active. -GPU Memory Usage (DCGM_FI_DEV_FB_USED) - Measures how much GPU memory is being used at a given point in time. This is useful for workloads that implement dynamic allocation of GPU memory. -``` - -``` -sh -kubectl apply -f manifests/inference-scale/hpa_gpu-metrics.yaml -n ${SERVE_NAMESPACE} -``` - -Here is a sample metrics graph that represent the bGPU metrics for duty cycle. ![metrics graph](./manifests/inference-scale/gpu-metrics.png) to review. - -CPU metrics: Since the inference workloads primarily rely on GPU resources, we don't recommend CPU and memory utilization as the only indicators of the amount of resources a job consumes. Therefore, using CPU metrics alone for autoscaling can lead to suboptimal performance and costs. - -HPA is an efficient way to ensure that your model servers scale appropriately with load. Fine-tuning the HPA settings is the primary way to align your provisioned hardware cost with traffic demands to achieve your inference server performance goals. - -We recommend setting these HPA configuration options: - -- Stabilization window: Use this HPA configuration option to prevent rapid replica count changes due to fluctuating metrics. Defaults are 5 minutes for scale-down (avoiding premature downscaling) and 0 for scale-up (ensuring responsiveness). Adjust the value based on your workload's volatility and your preferred responsiveness. - -- Scaling policies: Use this HPA configuration option to fine-tune the scale-up and scale-down behavior. You can set the "Pods" policy limit to specify the absolute number of replicas changed per time unit, and the "Percent" policy limit to specify by the percentage change. - -For more details, see Horizontal pod autoscaling in the Google Cloud Managed Service for Prometheus [documentation](https://cloud.google.com/kubernetes-engine/docs/horizontal-pod-autoscaling). - -Pre-requisites: - -- GKE cluster running inference workload as shown in previous examples. 
-- Export the metrics from the vLLM server to Cloud Monitoring as shown in enable monitoring section. - -We have couple of options to scale the inference workload on GKE using the HPA and custom metrics adapter. - -- Scale pod on the same node as the existing inference workload. -- Scale pod on the other nodes in the same node pool as the existing inference workload. - -#### Prepare your environment to autoscale with HPA metrics - -Install the Custom Metrics Adapter. This adapter makes the custom metric that you exported to Cloud Monitoring visible to the HPA. For more details, see HPA in the [Google Cloud Managed Service for Prometheus documentation](https://cloud.google.com/stackdriver/docs/managed-prometheus/hpa). - -The following example command shows how to install the adapter: - - ```sh - kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml - ``` - -Set up the custom metric-based HPA resource. Deploy an HPA resource that is based on your preferred custom metric. - -Here is a sample metrics graph that represent the batch size. ![metrics graph](./manifests/inference-scale/cloud-monitoring-metrics-inference.png) to review. - - -- Batch-size - - ```sh - AVERAGE_VALUE = 10 # Replace it with a value of choice - sed -i -e "s|V_AVERAGE_VALUE|${AVERAGE_VALUE}|" manifests/inference-scale/hpa_vllm_openai_batch_size.yaml - kubectl apply -f manifests/inference-scale/hpa_vllm_openai_batch_size.yaml -n ${SERVE_NAMESPACE} - ``` - -> Note: Below is an example of the batch size HPA scale test below: - -```sh -kubectl get hpa vllm-openai-hpa -n ${SERVE_NAMESPACE} --watch -NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE -vllm-openai-hpa Deployment/vllm-openai 0/10 1 5 1 6d16h -vllm-openai-hpa Deployment/vllm-openai 13/10 1 5 1 6d16h -vllm-openai-hpa Deployment/vllm-openai 17/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 12/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 17/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 14/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 17/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 10/10 1 5 2 6d16h -``` - -```sh -kubectl get pods -n ${SERVE_NAMESPACE} --watch -NAME READY STATUS RESTARTS AGE -gradio-6b8698d7b4-88zm7 1/1 Running 0 10d -model-eval-2sxg2 0/1 Completed 0 8d -vllm-openai-767b477b77-2jm4v 1/1 Running 0 3d17h -vllm-openai-767b477b77-82l8v 0/1 Pending 0 9s -``` - -Pod scaled up -```sh -kubectl get pods -n ${SERVE_NAMESPACE} --watch -NAME READY STATUS RESTARTS AGE -gradio-6b8698d7b4-88zm7 1/1 Running 0 10d -model-eval-2sxg2 0/1 Completed 0 8d -vllm-openai-767b477b77-2jm4v 1/1 Running 0 3d17h -vllm-openai-767b477b77-82l8v 1/1 Running 0 111s -``` - -The new pod is deployed on a node triggered by the cluster autoscaler. -> NOTE: The existing node where inference workload was deployed in this case had only two GPUS. Hence a new node is required to deploy the copy pod of inference workload. - -```sh -kubectl describe pods vllm-openai-767b477b77-82l8v -n ${SERVE_NAMESPACE} - -Events: - Type Reason Age From Message - ---- ------ ---- ---- ------- - Warning FailedScheduling 4m15s gke.io/optimize-utilization-scheduler 0/3 nodes are available: 1 Insufficient ephemeral-storage, 1 Insufficient nvidia.com/gpu, 2 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 1 No preemption victims found for incoming pod, 2 Preemption is not helpful for scheduling. 
- Normal TriggeredScaleUp 4m13s cluster-autoscaler pod triggered scale-up: [{https://www.googleapis.com/compute/v1/projects/gkebatchexpce3c8dcb/zones/us-east4-a/instanceGroups/gke-kh-e2e-l4-2-c399c5c0-grp 1->2 (max: 20)}] - Normal Scheduled 2m40s gke.io/optimize-utilization-scheduler Successfully assigned ml-serve/vllm-openai-767b477b77-82l8v to gke-kh-e2e-l4-2-c399c5c0-vvm9 - Normal SuccessfulAttachVolume 2m36s attachdetach-controller AttachVolume.Attach succeeded for volume "model-weights-disk-1024gb-zone-a" - Normal Pulling 2m29s kubelet Pulling image "vllm/vllm-openai:v0.5.3.post1" - Normal Pulled 2m25s kubelet Successfully pulled image "vllm/vllm-openai:v0.5.3.post1" in 4.546s (4.546s including waiting). Image size: 5586843591 bytes. - Normal Created 2m25s kubelet Created container inference-server - Normal Started 2m25s kubelet Started container inference-server -``` diff --git a/use-cases/inferencing/serving/vllm/gcsfuse/README.md b/use-cases/inferencing/serving/vllm/gcsfuse/README.md new file mode 100644 index 00000000..1c364962 --- /dev/null +++ b/use-cases/inferencing/serving/vllm/gcsfuse/README.md @@ -0,0 +1,177 @@ +# Distributed Inferencing on vLLM + +There are three common strategies for inference on vLLM: + +- Single GPU (no distributed inference) +- Single-Node Multi-GPU (tensor parallel inference) +- Multi-Node Multi-GPU + +In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphical processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework with the above mentioned deployment strategies. You can choose to swap the Gemma model with any other fine-tuned or instruction based model for inference on GKE. + +- Single GPU (no distributed inference) - If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you need 4 GPUs, you can set the tensor parallel size to 4. + +By the end of this guide, you should be able to perform the following steps: + +- Deploy a vLLM container to your cluster to host your model +- Use vLLM to serve the fine-tuned Gemma model +- View Production metrics for your model serving +- Use custom metrics and Horizontal Pod Autoscaler (HPA) to scale your model + +## Prerequisites + +- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment the scripts and manifest will need to be modified for that environment. +- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md) + +## Preparation + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms + ``` + +- Change directory to the guide directory + + ```sh + cd use-cases/inferencing/serving/vllm/gcsfuse + ``` + +- Ensure that your `MLP_ENVIRONMENT_FILE` is configured + + ```sh + cat ${MLP_ENVIRONMENT_FILE} && \ + source ${MLP_ENVIRONMENT_FILE} + ``` + + > You should see the various variables populated with the information specific to your environment. 
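As an optional sanity check before continuing, you can confirm that the variables this guide relies on were populated when you sourced the environment file. This is a sketch that assumes a bash shell; the variable names are taken from the commands used later in this guide:

```sh
# Sketch: verify the environment variables used later in this guide are set.
# Uses bash indirect expansion; adjust if your environment file differs.
for var in MLP_PROJECT_ID MLP_PROJECT_NUMBER MLP_CLUSTER_NAME MLP_MODEL_BUCKET; do
  if [ -n "${!var}" ]; then
    echo "${var}=${!var}"
  else
    echo "WARNING: ${var} is not set"
  fi
done
```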
+ +- Configure the environment + + | Variable | Description | Example | + | --------------- | ---------------------------------------- | ------------ | + | SERVE_KSA | The Kubernetes service account | ml-serve-gcs | + | SERVE_NAMESPACE | Namespace where the model will be served | ml-serve | + + ```sh + SERVE_KSA=ml-serve-gcs + SERVE_NAMESPACE=ml-serve + ``` + +- Get Credentials for the GKE cluster + + ```sh + gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID} + ``` + +- Create the namespace + + ```sh + kubectl create ns ${SERVE_NAMESPACE} + kubectl create sa ${SERVE_KSA} -n ${SERVE_NAMESPACE} + gcloud storage buckets add-iam-policy-binding "gs://${MLP_MODEL_BUCKET}" \ + --member "principal://iam.googleapis.com/projects/${MLP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${MLP_PROJECT_ID}.svc.id.goog/subject/ns/${SERVE_NAMESPACE}/sa/${SERVE_KSA}" \ + --role "roles/storage.objectViewer" + ``` + +## Prepare the Persistent Disk (PD) + +Loading model weights from a PersistentVolume is a method to load models faster. In GKE, PersistentVolumes backed by Google Cloud Persistent Disks can be mounted read-only simultaneously by multiple nodes (ReadOnlyMany), this allows multiple pods access to the model weights from a single volume. + +- Configure the environment + + | Variable | Description | Example | + | ------------- | -------------------------------------------------------------------------------------------- | ------------- | + | ACCELERATOR | Type of GPU accelerator to use (l4, a100, h100) | l4 | + | MODEL_NAME | The name of the model folder in the root of the GCS model bucket | model-gemma2 | + | MODEL_VERSION | The name of the version folder inside the model folder of the GCS model bucket | experiment | + | ZONE | GCP zone where you have accelerators available. The zone must be in the region ${MLP_REGION} | us-central1-a | + + ```sh + ACCELERATOR=l4 + MODEL_NAME=model-gemma2 + MODEL_VERSION=experiment + ZONE=us-central1-a + ``` + +## Serve the model with vLLM + +- Configure the deployment + + ``` + VLLM_IMAGE_NAME="vllm/vllm-openai:v0.6.3.post1" + ``` + + ```sh + sed \ + -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \ + -i -e "s|V_IMAGE_NAME|${VLLM_IMAGE_NAME}|" \ + -i -e "s|V_KSA|${SERVE_KSA}|" \ + manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Create the deployment + + ``` + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Wait for the deployment to be ready + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} wait --for=condition=ready --timeout=900s pod --selector app=vllm-openai-gcs-${ACCELERATOR} + ``` + +## Serve the model through a web chat interface + +- Configure the deployment + + ```sh + sed \ + -i -e "s|V_ACCELERATOR|${ACCELERATOR}|g" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|g" \ + manifests/gradio.yaml + ``` + +- Create the deployment + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/gradio.yaml + ``` + +- Verify the deployment is ready + +- Access the chat interface + + ```sh + echo -e "\nGradio chat interface: ${MLP_GRADIO_NAMESPACE_ENDPOINT}\n" + ``` + +- Enter the following prompt in the chat text box to get the response from the model. + + ``` + I'm looking for comfortable cycling shorts for women, what are some good options? 
+ ``` + +## Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the system. For more information about accessing these metrics see [vLLM Metrics](/use-cases/inferencing/serving/vllm/metrics/README.md). + +### Run Batch inference on GKE + +Once a model has completed fine-tuning and is deployed on GKE , you can run batch inference on it. Follow the instructions in [batch-inference readme](/use-cases/inferencing/batch-inference/README.md) to run batch inference. + +### Run benchmarks for inference + +The model is ready to run the benchmarks for inference job. Follow [benchmark readme](/use-cases/inferencing/benchmarks/README.md) to run inference benchmarks on our model. + +### Inference at Scale + +You can configure Horizontal Pod Autoscaler to scale your inference deployment based +on relevant metrics. Follow the instructions on +[inference at scale reademe](./inference-scale/README.md) to scale your +deployed model. diff --git a/use-cases/inferencing/serving-with-vllm/manifests/gradio.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/gradio.yaml similarity index 70% rename from use-cases/inferencing/serving-with-vllm/manifests/gradio.yaml rename to use-cases/inferencing/serving/vllm/gcsfuse/manifests/gradio.yaml index f2926cca..8462fae4 100644 --- a/use-cases/inferencing/serving-with-vllm/manifests/gradio.yaml +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/gradio.yaml @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +--- apiVersion: apps/v1 kind: Deployment metadata: - name: gradio labels: app: gradio + name: gradio spec: replicas: 1 selector: @@ -29,42 +29,46 @@ spec: app: gradio spec: containers: - - name: gradio - image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.3 - resources: - requests: - cpu: "250m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "512Mi" - env: + - env: - name: CONTEXT_PATH - value: "/v1/chat/completions" + value: /v1/chat/completions - name: HOST - value: "http://vllm-openai:8000" + value: http://vllm-openai-gcs-V_ACCELERATOR:8000 - name: DISABLE_SYSTEM_MESSAGE - value: "true" + value: 'true' - name: LLM_ENGINE - value: "openai" + value: openai - name: MODEL_ID - value: V_MODEL_ID + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION - name: USER_PROMPT - value: "user\nprompt\n" + value: | + user + prompt - name: SYSTEM_PROMPT - value: "model\nprompt\n" + value: | + model + prompt + image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.3 + name: gradio ports: - containerPort: 7860 + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 512Mi --- apiVersion: v1 kind: Service metadata: name: gradio spec: - selector: - app: gradio ports: - - protocol: TCP - port: 8080 + - port: 8080 + protocol: TCP targetPort: 7860 - type: LoadBalancer + selector: + app: gradio + type: LoadBalancer \ No newline at end of file diff --git a/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml new file mode 100644 index 00000000..6f4adcce --- /dev/null +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml @@ -0,0 +1,107 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-openai-gcs-a100 +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-openai-gcs-a100 + template: + metadata: + labels: + app: vllm-openai-gcs-a100 + annotations: + gke-gcsfuse/volumes: "true" + spec: + containers: + - name: inference-server + args: + - --model=$(MODEL) + - --tensor-parallel-size=2 + env: + - name: MODEL + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION + - name: VLLM_ATTENTION_BACKEND + value: FLASHINFER + image: V_IMAGE_NAME + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 240 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + limits: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: gcs-fuse-csi-ephemeral + mountPath: /gcs + readOnly: true + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-tesla-a100 + serviceAccountName: V_KSA + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "on-demand" + value: "true" + operator: "Equal" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: V_MODEL_BUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1" + fileCacheCapacity: "20Gi" + fileCacheForRangeRead: "true" + metadataStatCacheCapacity: "-1" + metadataTypeCacheCapacity: "-1" + metadataCacheTTLSeconds: "-1" +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-openai-gcs-a100 +spec: + selector: + app: vllm-openai-gcs-a100 + type: ClusterIP + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-h100.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-h100.yaml new file mode 100644 index 00000000..786917db --- /dev/null +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-h100.yaml @@ -0,0 +1,107 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-openai-gcs-h100 +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-openai-gcs-h100 + template: + metadata: + labels: + app: vllm-openai-gcs-h100 + annotations: + gke-gcsfuse/volumes: "true" + spec: + containers: + - name: inference-server + args: + - --model=$(MODEL) + - --tensor-parallel-size=2 + env: + - name: MODEL + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION + - name: VLLM_ATTENTION_BACKEND + value: FLASHINFER + image: V_IMAGE_NAME + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 240 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + limits: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: gcs-fuse-csi-ephemeral + mountPath: /gcs + readOnly: true + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-h100-80gb + serviceAccountName: V_KSA + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "on-demand" + value: "true" + operator: "Equal" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: V_MODEL_BUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1" + fileCacheCapacity: "20Gi" + fileCacheForRangeRead: "true" + metadataStatCacheCapacity: "-1" + metadataTypeCacheCapacity: "-1" + metadataCacheTTLSeconds: "-1" +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-openai-gcs-h100 +spec: + selector: + app: vllm-openai-gcs-h100 + type: ClusterIP + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/use-cases/inferencing/serving-with-vllm/manifests/model_deployment.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-l4.yaml similarity index 58% rename from use-cases/inferencing/serving-with-vllm/manifests/model_deployment.yaml rename to use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-l4.yaml index a56206e7..326b1c22 100644 --- a/use-cases/inferencing/serving-with-vllm/manifests/model_deployment.yaml +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-l4.yaml @@ -15,21 +15,40 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: vllm-openai + name: vllm-openai-gcs-l4 spec: replicas: 1 selector: matchLabels: - app: vllm-openai + app: vllm-openai-gcs-l4 template: metadata: labels: - app: vllm-openai + app: vllm-openai-gcs-l4 + annotations: + gke-gcsfuse/volumes: "true" spec: - serviceAccountName: default containers: - name: inference-server - image: vllm/vllm-openai:v0.5.3.post1 + args: + - --model=$(MODEL) + - --tensor-parallel-size=2 + env: + - name: MODEL + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION + - name: VLLM_ATTENTION_BACKEND + value: FLASHINFER + image: V_IMAGE_NAME + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 240 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 resources: requests: cpu: "2" @@ -41,47 +60,46 @@ spec: memory: "25Gi" ephemeral-storage: "25Gi" nvidia.com/gpu: "2" - args: - - --model=$(MODEL) - - --tensor-parallel-size=2 - env: - - name: MODEL - value: /data/models/V_MODEL_ID/V_MODEL_DIR_PATH - - name: 
VLLM_ATTENTION_BACKEND - value: FLASHINFER volumeMounts: - mountPath: /dev/shm name: dshm - - name: model-disk - mountPath: /data/models - readOnly: true - volumes: - - name: dshm - emptyDir: - medium: Memory - - name: model-disk - persistentVolumeClaim: - claimName: pvc-model-weights-claim-1024gb-zone-a-ro + - name: gcs-fuse-csi-ephemeral + mountPath: /gcs readOnly: true + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 + serviceAccountName: V_KSA tolerations: - key: "nvidia.com/gpu" - operator: "Equal" - value: "present" + operator: "Exists" effect: "NoSchedule" - key: "on-demand" - operator: "Equal" value: "true" - effect: "NoSchedule" - nodeSelector: - cloud.google.com/gke-accelerator: V_ACCELERATOR_TYPE + operator: "Equal" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: V_MODEL_BUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1" + fileCacheCapacity: "20Gi" + fileCacheForRangeRead: "true" + metadataStatCacheCapacity: "-1" + metadataTypeCacheCapacity: "-1" + metadataCacheTTLSeconds: "-1" --- apiVersion: v1 kind: Service metadata: - name: vllm-openai + name: vllm-openai-gcs-l4 spec: selector: - app: vllm-openai + app: vllm-openai-gcs-l4 type: ClusterIP ports: - protocol: TCP diff --git a/use-cases/inferencing/serving/vllm/metrics/README.md b/use-cases/inferencing/serving/vllm/metrics/README.md new file mode 100644 index 00000000..f0499252 --- /dev/null +++ b/use-cases/inferencing/serving/vllm/metrics/README.md @@ -0,0 +1,109 @@ +# vLLM Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. These metrics can be scraped using Google Managed Prometheus (GMP) and made available in [Cloud Metrics](https://console.cloud.google.com/monitoring/metrics-explorer). For more details, see [pod monitoring with Google managed prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed#gmp-pod-monitoring). + +## Prerequisites + +- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment the scripts and manifest will need to be modified for that environment. 
+- A model is deployed using one of the vLLM guides + - [Serving the model using vLLM and GCSFuse](/use-cases/inferencing/serving/vllm/gcsfuse/README.md) + - [Serving the model using vLLM and Persistent Disk](/use-cases/inferencing/serving/vllm/persistent-disk/README.md) + +## Preparation + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms + ``` + +- Change directory to the guide directory + + ```sh + cd use-cases/inferencing/serving/vllm/metrics + METRICS_DIR=$(pwd) + ``` + +- Ensure that your `MLP_ENVIRONMENT_FILE` is configured + + ```sh + cat ${MLP_ENVIRONMENT_FILE} && \ + source ${MLP_ENVIRONMENT_FILE} + ``` + +## Deploy the PodMonitoring resource + +- Configure the environment + + | Variable | Description | Example | + | --------------- | --------------------------------------------- | -------- | + | ACCELERATOR | Type of GPU accelerator used (l4, a100, h100) | l4 | + | MODEL_STORAGE | Type of storage used for the model (gcs, pd) | pd | + | SERVE_NAMESPACE | Namespace where the model will be served | ml-serve | + + ```sh + ACCELERATOR=l4 + MODEL_STORAGE=pd + SERVE_NAMESPACE=ml-serve + ``` + +- Configure the resource + + ```sh + sed \ + -i -e "s|V_ACCELERATOR|${ACCELERATOR}|" \ + -i -e "s|V_MODEL_STORAGE|${MODEL_STORAGE}|" \ + manifests/pod-monitoring.yaml + ``` + +- Create the resource + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/pod-monitoring.yaml + ``` + +## View the metrics + +- Make several requests to your model to populate metrics + +- Wait a minute for the metrics to populate, then you can view the metrics in the Metrics explorer + - Go to the [Metrics explorer](https://console.cloud.google.com/monitoring/metrics-explorer) + - Click the **Select a metric** dropdown near the upper left of the screen + - Select **Prometheus Target** + - Select **Vllm**, you should now see a list of the available metrics. + - Select **Prometheus/vllm:avg_generation_throughput_toks_per_s/gauge** + - Click **Apply** + - Click **Add filter** in the **Filter** text box + - Under **Resource labels** select **cluster** + - For the **value** select the name of your cluster + - You should now see the metrics for your cluster + +## Create a dashboard + +Cloud Monitoring provides an [importer](https://cloud.google.com/monitoring/dashboards/import-grafana-dashboards) that you can use to import dashboard files in the Grafana JSON format into Cloud Monitoring. + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/monitoring-dashboard-samples + ``` + +- Change to the directory for the dashboard importer: + + ```sh + cd monitoring-dashboard-samples/scripts/dashboard-importer + ``` + +The dashboard importer includes the following scripts: + +- `import.sh`, which converts dashboards and optionally uploads the converted dashboards to Cloud Monitoring. +- `upload.sh`, which uploads the converted dashboards or any Monitoring dashboards to Cloud Monitoring. The `import.sh` script calls this script to do the upload.
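Before importing the dashboard, you can optionally confirm that vLLM metrics are being emitted. This is a minimal sketch that assumes the service naming used in the serving guides (`vllm-openai-<storage>-<accelerator>` listening on port 8000):

```sh
# Sketch: check the raw Prometheus metrics exposed by the vLLM server before
# wiring up dashboards. The service name is assembled from the MODEL_STORAGE and
# ACCELERATOR values configured above; adjust it if your service is named differently.
kubectl --namespace ${SERVE_NAMESPACE} port-forward "svc/vllm-openai-${MODEL_STORAGE}-${ACCELERATOR}" 8000:8000 &
sleep 5
curl -s http://localhost:8000/metrics | grep "^vllm:" | head
kill %1
```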
+ +- Import the dashboard + + ```sh + ./import.sh ${METRICS_DIR}/grafana/vllm.json ${MLP_PROJECT_ID} + ``` + +- A link to the dashboard will be output by the script, open the link to view the dashboard diff --git a/use-cases/inferencing/serving-with-vllm/configs/grafana.json b/use-cases/inferencing/serving/vllm/metrics/grafana/vllm.json similarity index 100% rename from use-cases/inferencing/serving-with-vllm/configs/grafana.json rename to use-cases/inferencing/serving/vllm/metrics/grafana/vllm.json diff --git a/use-cases/inferencing/serving-with-vllm/manifests/pod_monitoring.yaml b/use-cases/inferencing/serving/vllm/metrics/manifests/pod-monitoring.yaml similarity index 82% rename from use-cases/inferencing/serving-with-vllm/manifests/pod_monitoring.yaml rename to use-cases/inferencing/serving/vllm/metrics/manifests/pod-monitoring.yaml index 93b233b5..d3d2c932 100644 --- a/use-cases/inferencing/serving-with-vllm/manifests/pod_monitoring.yaml +++ b/use-cases/inferencing/serving/vllm/metrics/manifests/pod-monitoring.yaml @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +--- apiVersion: monitoring.googleapis.com/v1 kind: PodMonitoring metadata: - name: vllm-inference + name: vllm-inference-V_MODEL_STORAGE-V_ACCELERATOR labels: - app: vllm-openai + app: vllm-inference-V_MODEL_STORAGE-V_ACCELERATOR spec: selector: matchLabels: - app: vllm-openai + app: vllm-openai-V_MODEL_STORAGE-V_ACCELERATOR endpoints: - port: 8000 path: /metrics diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/README.md b/use-cases/inferencing/serving/vllm/persistent-disk/README.md new file mode 100644 index 00000000..b766992f --- /dev/null +++ b/use-cases/inferencing/serving/vllm/persistent-disk/README.md @@ -0,0 +1,349 @@ +# Distributed Inferencing on vLLM + +There are three common strategies for inference on vLLM: + +- Single GPU (no distributed inference) +- Single-Node Multi-GPU (tensor parallel inference) +- Multi-Node Multi-GPU + +In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphical processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework with the above mentioned deployment strategies. You can choose to swap the Gemma model with any other fine-tuned or instruction based model for inference on GKE. + +- Single GPU (no distributed inference) - If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you need 4 GPUs, you can set the tensor parallel size to 4. + +By the end of this guide, you should be able to perform the following steps: + +- Create a Persistent Disk for the LLM model weights +- Deploy a vLLM container to your cluster to host your model +- Use vLLM to serve the fine-tuned Gemma model +- View Production metrics for your model serving +- Use custom metrics and Horizontal Pod Autoscaler (HPA) to scale your model + +## Prerequisites + +- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). 
If you are using a different environment the scripts and manifest will need to be modified for that environment. +- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md) + +## Preparation + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms + ``` + +- Change directory to the guide directory + + ```sh + cd use-cases/inferencing/serving/vllm/persistent-disk + ``` + +- Ensure that your `MLP_ENVIRONMENT_FILE` is configured + + ```sh + cat ${MLP_ENVIRONMENT_FILE} && \ + source ${MLP_ENVIRONMENT_FILE} + ``` + + > You should see the various variables populated with the information specific to your environment. + +- Configure the environment + + | Variable | Description | Example | + | --------------- | ------------------------------------------------------------------------------------------------- | ----------- | + | OPS_KSA | Kubernetes service account used for operations | ml-ops | + | OPS_NAMESPACE | Namespace where download/upload model artifacts and dataset to and from GCS and artifact registry | ml-ops | + | SERVE_KSA | The Kubernetes service account | ml-serve-pd | + | SERVE_NAMESPACE | Namespace where the model will be served | ml-serve | + + ```sh + OPS_KSA=ml-ops + OPS_NAMESPACE=ml-ops + SERVE_KSA=ml-serve-pd + SERVE_NAMESPACE=ml-serve + ``` + +- Get Credentials for the GKE cluster + + ```sh + gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID} + ``` + +- Grant permission to kubernetes service account in cluster to access the storage bucket to view model weights + + ```sh + kubectl create ns ${OPS_NAMESPACE} + kubectl create sa ${OPS_KSA} -n ${OPS_NAMESPACE} + gcloud storage buckets add-iam-policy-binding "gs://${MLP_MODEL_BUCKET}" \ + --member "principal://iam.googleapis.com/projects/${MLP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${MLP_PROJECT_ID}.svc.id.goog/subject/ns/${OPS_NAMESPACE}/sa/${OPS_KSA}" \ + --role "roles/storage.objectViewer" + + kubectl create ns ${SERVE_NAMESPACE} + kubectl create sa ${SERVE_KSA} -n ${SERVE_NAMESPACE} + ``` + +## Prepare the Persistent Disk (PD) + +Loading model weights from a PersistentVolume is a method to load models faster. In GKE, PersistentVolumes backed by Google Cloud Persistent Disks can be mounted read-only simultaneously by multiple nodes (ReadOnlyMany), this allows multiple pods access to the model weights from a single volume. + +- Configure the environment + + | Variable | Description | Example | + | -------------- | -------------------------------------------------------------------------------------------- | -------------------------- | + | ACCELERATOR | Type of GPU accelerator to use (l4, a100, h100) | l4 | + | GCE_DISK_NAME | Name of the persistent disk that will host the model | vllm-model-weights-${ZONE} | + | GCE_IMAGE_NAME | Disk image created with model weights | vllm-model-weights-${ZONE} | + | MODEL_NAME | The name of the model folder in the root of the GCS model bucket | model-gemma2 | + | MODEL_VERSION | The name of the version folder inside the model folder of the GCS model bucket | experiment | + | ZONE | GCP zone where you have accelerators available. 
The zone must be in the region ${MLP_REGION} | us-central1-a | + + ```sh + ACCELERATOR=l4 + MODEL_NAME=model-gemma2 + MODEL_VERSION=experiment + ZONE=us-central1-a + ``` + + ```ssh + GCE_DISK_NAME=vllm-model-weights-${ZONE} # TODO: make this unique across environments + GCE_IMAGE_NAME=vllm-model-weights-${ZONE} # TODO: make this unique across environments + ``` + +### Download the model from GCS to a PersistentVolume (PV) + +- Create a PersistentVolumeClaim (PVC) for the model weights + + ```sh + kubectl --namespace ${OPS_NAMESPACE} apply -f manifests/volume-prep/pvc_disk_image.yaml + ``` + +- Configure the job to download the model from the GCS bucket to the PersistentVolume (PV) + + ```sh + sed \ + -i -e "s|V_KSA|${OPS_KSA}|" \ + -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \ + manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml + ``` + +- Create the job. + + ``` + kubectl --namespace ${OPS_NAMESPACE} create -f manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml + ``` + +- Once the job has started, you can check the pod logs for the progress of the download + + ```sh + POD=$(kubectl --namespace ${OPS_NAMESPACE} get pods --no-headers --output custom-columns=":metadata.name" --selector app=model-downloader) + kubectl --namespace ${OPS_NAMESPACE} logs pod/${POD} + ``` + + If the download is still in progress you should see something similar to: + + ``` + ...... + + Allocating group tables: done + Writing inode tables: done + Creating journal (###### blocks): done + Writing superblocks and filesystem accounting information: done + + ``` + + If the download is complete you should see something similar to: + + ``` + ...... + + Allocating group tables: done + Writing inode tables: done + Creating journal (###### blocks): done + Writing superblocks and filesystem accounting information: done + + + total ##K + drwxr-xr-x 3 root root 4.0K MMM DD HH:MM . + drwxr-xr-x 4 root root 4.0K MMM DD HH:MM .. + drwxr-xr-x 3 root root 4.0K MMM DD HH:MM experiment + total ##G + drwxr-xr-x 3 root root #### MMM DD HH:MM . + drwxr-xr-x 3 root root #### MMM DD HH:MM .. + -rw-r--r-- 1 root root #### MMM DD HH:MM README.md + drwxr-xr-x 4 root root #### MMM DD HH:MM checkpoint-##### + -rw-r--r-- 1 root root #### MMM DD HH:MM config.json + -rw-r--r-- 1 root root #### MMM DD HH:MM generation_config.json + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00001-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00002-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00003-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00004-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model.safetensors.index.json + -rw-r--r-- 1 root root #### MMM DD HH:MM special_tokens_map.json + -rw-r--r-- 1 root root #### MMM DD HH:MM tokenizer.json + -rw-r--r-- 1 root root #### MMM DD HH:MM tokenizer_config.json + ``` + +- Wait for the job to complete + + ```sh + kubectl wait --namespace=${OPS_NAMESPACE} --for=condition=complete --timeout=900s job/model-downloader && echo "complete" & + kubectl wait --namespace=${OPS_NAMESPACE} --for=condition=failed --timeout=900s job/model-downloader && echo "failed" && exit 1 & + wait -n && pkill -f "kubectl wait --namespace=${OPS_NAMESPACE}" + ``` + + ``` + job.batch/model-downloader condition met + complete + ``` + + Now, the model is downloaded to the persistent volume. 
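As an optional check before imaging the disk in the next section, you can confirm that the PVC holding the downloaded weights is bound. A short sketch, using the `vllm-models` PVC referenced in the next section:

```sh
# Sketch: confirm the PVC backing the downloaded model weights is Bound and note
# the PersistentVolume it is bound to before creating the disk image.
kubectl --namespace ${OPS_NAMESPACE} get pvc/vllm-models \
  --output custom-columns="NAME:.metadata.name,STATUS:.status.phase,VOLUME:.spec.volumeName"
```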
+ +### Create the Persistent Disk + +- Fetch the Persistent volume name and disk ref to create a disk image + + ```sh + PV_NAME="$(kubectl --namespace ${OPS_NAMESPACE} get pvc/vllm-models -o jsonpath='{.spec.volumeName}')" + GCE_DISK_REF="$(kubectl --namespace ${OPS_NAMESPACE} get pv/${PV_NAME} -o jsonpath='{.spec.csi.volumeHandle}')" + echo "PV_NAME=${PV_NAME}" + echo "GCE_DISK_REF=${GCE_DISK_REF}" + ``` + +- Create a Compute Engine image + + ```sh + gcloud compute images create ${GCE_IMAGE_NAME} \ + --source-disk="${GCE_DISK_REF}" + ``` + +- Create a Persistent Disk from the image + + ```sh + gcloud compute disks create ${GCE_DISK_NAME} \ + --image=${GCE_IMAGE_NAME} \ + --size=1TiB \ + --type=pd-ssd \ + --zone=${ZONE} + ``` + + > Note: Ensure the appropriate zone based on cluster node location and GPU availability + +### Create the PersistentVolumeClaim (PVC) and PersistentVolume (PV) for serving + +- Configure the PersistentVolume + + ```sh + VOLUME_HANDLE="projects/${MLP_PROJECT_ID}/zones/${ZONE}/disks/${GCE_DISK_NAME}" + echo "VOLUME_HANDLE=${VOLUME_HANDLE}" + sed \ + -i -e "s|V_VOLUME_HANDLE|${VOLUME_HANDLE}|" \ + -i -e "s|V_ZONE|${ZONE}|" \ + manifests/volume-prep/persistent_volume.yaml + ``` + +- Create the PersistentVolume + + ``` + kubectl apply -f manifests/volume-prep/persistent_volume.yaml + ``` + + > Note: PersistenVolumes are cluster-wide resources, meaning they do not belong to any specific namespace. + +- Configure the PersistentVolumeClaim + + ```sh + sed \ + -i -e "s|V_ZONE|${ZONE}|" \ + manifests/volume-prep/persistent_volume_claim.yaml + ``` + +- Create the PersistentVolumeClaim + + ``` + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/volume-prep/persistent_volume_claim.yaml + ``` + +## Serve the model with vLLM + +- Configure the deployment + + ``` + VLLM_IMAGE_NAME="vllm/vllm-openai:v0.6.3.post1" + ``` + + ```sh + sed \ + -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \ + -i -e "s|V_KSA|${SERVE_KSA}|" \ + -i -e "s|V_VLLM_IMAGE_URL|${VLLM_IMAGE_NAME}|" \ + -i -e "s|V_ZONE|${ZONE}|" \ + manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Create the deployment + + ``` + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Wait for the deployment to be ready + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} wait --for=condition=ready --timeout=900s pod --selector app=vllm-openai-pd-${ACCELERATOR} + ``` + +## Serve the model through a web chat interface + +- Configure the deployment + + ```sh + sed \ + -i -e "s|V_ACCELERATOR|${ACCELERATOR}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|g" \ + manifests/gradio.yaml + ``` + +- Create the deployment + + ```sh + kubectl apply -f manifests/gradio.yaml -n ${SERVE_NAMESPACE} + ``` + +- Verify the deployment is ready + +- Access the chat interface + + ```sh + echo -e "\nGradio chat interface: ${MLP_GRADIO_NAMESPACE_ENDPOINT}\n" + ``` + +- Enter the following prompt in the chat text box to get the response from the model. + + ``` + I'm looking for comfortable cycling shorts for women, what are some good options? + ``` + +## Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the system. For more information about accessing these metrics see [vLLM Metrics](/use-cases/inferencing/serving/vllm/metrics/README.md). 
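If you want to exercise the OpenAI-compatible API directly in addition to the Gradio interface, the following is a minimal sketch. The service name and model path follow the values configured earlier in this guide, so adjust them if yours differ:

```sh
# Sketch: send one chat completion through a temporary port-forward. The model
# name is the path vLLM was started with (the MODEL env var in the deployment).
kubectl --namespace ${SERVE_NAMESPACE} port-forward "svc/vllm-openai-pd-${ACCELERATOR}" 8000:8000 &
sleep 5
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d @- <<EOF
{
  "model": "/local/${MODEL_NAME}/${MODEL_VERSION}",
  "messages": [
    {"role": "user", "content": "I'm looking for comfortable cycling shorts for women, what are some good options?"}
  ],
  "temperature": 0.7,
  "max_tokens": 256
}
EOF
kill %1
```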
+
+### Run Batch inference on GKE
+
+Once a model has completed fine-tuning and is deployed on GKE, you can run batch inference against it. Follow the instructions in the [batch-inference readme](/use-cases/inferencing/batch-inference/README.md) to run batch inference.
+
+### Run benchmarks for inference
+
+The deployed model is now ready for inference benchmarking. Follow the [benchmark readme](/use-cases/inferencing/benchmarks/README.md) to run inference benchmarks against your model.
+
+### Inference at Scale
+
+You can configure the Horizontal Pod Autoscaler to scale your inference deployment based
+on relevant metrics. Follow the instructions in the
+[inference at scale readme](/use-cases/inferencing/inference-scale/README.md) to scale your
+deployed model.
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/gradio.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/gradio.yaml
new file mode 100644
index 00000000..c60fff2d
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/gradio.yaml
@@ -0,0 +1,74 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: gradio
+  name: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+      - env:
+        - name: CONTEXT_PATH
+          value: /v1/chat/completions
+        - name: HOST
+          value: http://vllm-openai-pd-V_ACCELERATOR:8000
+        - name: DISABLE_SYSTEM_MESSAGE
+          value: 'true'
+        - name: LLM_ENGINE
+          value: openai
+        - name: MODEL_ID
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: USER_PROMPT
+          value: |
+            user
+            prompt
+        - name: SYSTEM_PROMPT
+          value: |
+            model
+            prompt
+        image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.3
+        name: gradio
+        ports:
+        - containerPort: 7860
+        resources:
+          limits:
+            cpu: 500m
+            memory: 512Mi
+          requests:
+            cpu: 250m
+            memory: 512Mi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio
+spec:
+  ports:
+  - port: 8080
+    protocol: TCP
+    targetPort: 7860
+  selector:
+    app: gradio
+  type: LoadBalancer
\ No newline at end of file
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-a100.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-a100.yaml
new file mode 100644
index 00000000..3a509524
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-a100.yaml
@@ -0,0 +1,101 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-openai-pd-a100
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-openai-pd-a100
+  template:
+    metadata:
+      labels:
+        app: vllm-openai-pd-a100
+    spec:
+      containers:
+      - args:
+        - '--model=$(MODEL)'
+        - '--tensor-parallel-size=2'
+        env:
+        - name: MODEL
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+        - name: VLLM_ATTENTION_BACKEND
+          value: FLASHINFER
+        image: V_VLLM_IMAGE_URL
+        name: inference-server
+        readinessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 240
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        resources:
+          limits:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+          requests:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /local
+          name: model-disk
+          readOnly: true
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-tesla-a100
+      serviceAccountName: V_KSA
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Equal
+        value: present
+      - effect: NoSchedule
+        key: on-demand
+        operator: Equal
+        value: 'true'
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: model-disk
+        persistentVolumeClaim:
+          claimName: vllm-model-weights-ssd-1024gb-V_ZONE-ro
+          readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-openai-pd-a100
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-openai-pd-a100
+  type: ClusterIP
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-h100.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-h100.yaml
new file mode 100644
index 00000000..0b7d3021
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-h100.yaml
@@ -0,0 +1,101 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-openai-pd-h100
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-openai-pd-h100
+  template:
+    metadata:
+      labels:
+        app: vllm-openai-pd-h100
+    spec:
+      containers:
+      - args:
+        - '--model=$(MODEL)'
+        - '--tensor-parallel-size=2'
+        env:
+        - name: MODEL
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+        - name: VLLM_ATTENTION_BACKEND
+          value: FLASHINFER
+        image: V_VLLM_IMAGE_URL
+        name: inference-server
+        readinessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 240
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        resources:
+          limits:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+          requests:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /local
+          name: model-disk
+          readOnly: true
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-h100-80gb
+      serviceAccountName: V_KSA
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Equal
+        value: present
+      - effect: NoSchedule
+        key: on-demand
+        operator: Equal
+        value: 'true'
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: model-disk
+        persistentVolumeClaim:
+          claimName: vllm-model-weights-ssd-1024gb-V_ZONE-ro
+          readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-openai-pd-h100
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-openai-pd-h100
+  type: ClusterIP
\ No newline at end of file
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-l4.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-l4.yaml
new file mode 100644
index 00000000..bccb564e
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-l4.yaml
@@ -0,0 +1,101 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-openai-pd-l4
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-openai-pd-l4
+  template:
+    metadata:
+      labels:
+        app: vllm-openai-pd-l4
+    spec:
+      containers:
+      - args:
+        - '--model=$(MODEL)'
+        - '--tensor-parallel-size=2'
+        env:
+        - name: MODEL
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+        - name: VLLM_ATTENTION_BACKEND
+          value: FLASHINFER
+        image: V_VLLM_IMAGE_URL
+        name: inference-server
+        readinessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 240
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        resources:
+          limits:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+          requests:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /local
+          name: model-disk
+          readOnly: true
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-l4
+      serviceAccountName: V_KSA
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Equal
+        value: present
+      - effect: NoSchedule
+        key: on-demand
+        operator: Equal
+        value: 'true'
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: model-disk
+        persistentVolumeClaim:
+          claimName: vllm-model-weights-ssd-1024gb-V_ZONE-ro
+          readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-openai-pd-l4
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-openai-pd-l4
+  type: ClusterIP
\ No newline at end of file
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
similarity index 77%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
index c93db8f7..1a2706e8 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
@@ -15,7 +15,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  generateName: module-download-job-
+  name: model-downloader
   labels:
     app: model-downloader
 spec:
@@ -31,10 +31,12 @@ spec:
         gke-gcsfuse/volumes: 'true'
         gke-gcsfuse/memory-limit: 10Gi
         gke-gcsfuse/memory-request: 4Gi
+      labels:
+        app: model-downloader
     spec:
-      serviceAccountName: V_KSA # sed replace KSA name
+      serviceAccountName: V_KSA
       containers:
-      - name: dldr
+      - name: model-downloader
        image: debian:latest
        command: ["/bin/sh", "-c"]
        args:
@@ -42,15 +44,16 @@ spec:
          apt-get update && apt-get install -y fuse2fs
          mkfs.ext4 -FF /dev/xvda
          fuse2fs -o fakeroot /dev/xvda /mnt
-         mkdir -p /mnt/V_MODEL_ID
-         cp -r /data/models/V_MODEL_ID/V_MODEL_DIR_PATH /mnt/V_MODEL_ID
-         ls -lR /mnt/V_MODEL_ID
+         mkdir -p /mnt/V_MODEL_NAME/V_MODEL_VERSION
+         cp -r /gcs/V_MODEL_NAME/V_MODEL_VERSION /mnt/V_MODEL_NAME/
+         ls -alh /mnt/V_MODEL_NAME
+         ls -alh /mnt/V_MODEL_NAME/V_MODEL_VERSION
        volumeDevices:
        - name: data
          devicePath: /dev/xvda
        volumeMounts:
-       - name: model-weights
-         mountPath: /data/models
+       - name: vllm-model-weights
+         mountPath: /gcs
        securityContext:
          privileged: true
       restartPolicy: Never
@@ -61,13 +64,13 @@ spec:
       volumes:
       - name: data
         persistentVolumeClaim:
-          claimName: block-pvc-model
-      - name: model-weights
+          claimName: vllm-models
+      - name: vllm-model-weights
         csi:
           driver: gcsfuse.csi.storage.gke.io
           readOnly: true
           volumeAttributes:
-            bucketName: V_MODEL_BUCKET # sed replace with your bucket name
+            bucketName: V_MODEL_BUCKET
             mountOptions: implicit-dirs
             gcsfuseLoggingSeverity: warning
             fileCacheCapacity: 10Gi
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume.yaml
similarity index 93%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume.yaml
index 6bc6ded0..d6bf1768 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume.yaml
@@ -15,9 +15,9 @@
 apiVersion: v1
 kind: PersistentVolume
 metadata:
-  name: model-weights-disk-1024gb-zone-a
+  name: vllm-model-weights-ssd-1024gb-V_ZONE
   labels:
-    pv-usage: model-weights
+    pv-usage: vllm-model-weights
     pv-spec: ssd-1024G
 spec:
   storageClassName: ""
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume_claim.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume_claim.yaml
similarity index 90%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume_claim.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume_claim.yaml
index d172d9c3..b01926c9 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume_claim.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume_claim.yaml
@@ -15,7 +15,7 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: pvc-model-weights-claim-1024gb-zone-a-ro
+  name: vllm-model-weights-ssd-1024gb-V_ZONE-ro
 spec:
   storageClassName: ""
   accessModes:
@@ -25,5 +25,5 @@ spec:
       storage: 1024Gi
   selector:
     matchLabels:
-      pv-usage: model-weights
+      pv-usage: vllm-model-weights
       pv-spec: ssd-1024G
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/pvc_disk_image.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/pvc_disk_image.yaml
similarity index 96%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/pvc_disk_image.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/pvc_disk_image.yaml
index 83d73f30..3498f193 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/pvc_disk_image.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/pvc_disk_image.yaml
@@ -15,7 +15,7 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: block-pvc-model
+  name: vllm-models
 spec:
   accessModes:
   - ReadWriteOnce