diff --git a/.gitignore b/.gitignore index c2305b35..d1dcf0df 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ __pycache__/ .venv/ venv/ +# Repositories +monitoring-dashboard-samples/ + # Terraform *.terraform/ *.terraform-*/ diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/README.md b/use-cases/inferencing/batch-inference/README.md similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/README.md rename to use-cases/inferencing/batch-inference/README.md diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/example_predictions.txt b/use-cases/inferencing/batch-inference/example_predictions.txt similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/example_predictions.txt rename to use-cases/inferencing/batch-inference/example_predictions.txt diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/manifests/batch_inference.yaml b/use-cases/inferencing/batch-inference/manifests/batch_inference.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/manifests/batch_inference.yaml rename to use-cases/inferencing/batch-inference/manifests/batch_inference.yaml diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/Dockerfile b/use-cases/inferencing/batch-inference/src/Dockerfile similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/Dockerfile rename to use-cases/inferencing/batch-inference/src/Dockerfile diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/cloudbuild.yaml b/use-cases/inferencing/batch-inference/src/cloudbuild.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/cloudbuild.yaml rename to use-cases/inferencing/batch-inference/src/cloudbuild.yaml diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/custom_json_formatter.py b/use-cases/inferencing/batch-inference/src/custom_json_formatter.py similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/custom_json_formatter.py rename to use-cases/inferencing/batch-inference/src/custom_json_formatter.py diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/logging.conf b/use-cases/inferencing/batch-inference/src/logging.conf similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/logging.conf rename to use-cases/inferencing/batch-inference/src/logging.conf diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/requirements.txt b/use-cases/inferencing/batch-inference/src/requirements.txt similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/requirements.txt rename to use-cases/inferencing/batch-inference/src/requirements.txt diff --git a/use-cases/inferencing/serving-with-vllm/batch-inference/src/run_batch_predictions.py b/use-cases/inferencing/batch-inference/src/run_batch_predictions.py similarity index 100% rename from use-cases/inferencing/serving-with-vllm/batch-inference/src/run_batch_predictions.py rename to use-cases/inferencing/batch-inference/src/run_batch_predictions.py diff --git a/use-cases/inferencing/serving-with-vllm/benchmarks/README.md b/use-cases/inferencing/benchmarks/README.md similarity index 100% rename from use-cases/inferencing/serving-with-vllm/benchmarks/README.md rename to use-cases/inferencing/benchmarks/README.md diff --git 
a/use-cases/inferencing/serving-with-vllm/benchmarks/locust.jpg b/use-cases/inferencing/benchmarks/locust.jpg similarity index 100% rename from use-cases/inferencing/serving-with-vllm/benchmarks/locust.jpg rename to use-cases/inferencing/benchmarks/locust.jpg diff --git a/use-cases/inferencing/serving-with-vllm/benchmarks/locustfile.py b/use-cases/inferencing/benchmarks/locustfile.py similarity index 100% rename from use-cases/inferencing/serving-with-vllm/benchmarks/locustfile.py rename to use-cases/inferencing/benchmarks/locustfile.py diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu_metrics.yaml b/use-cases/inferencing/inference-scale/gpu_metrics.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu_metrics.yaml rename to use-cases/inferencing/inference-scale/gpu_metrics.yaml diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/hpa_vllm_openai_batch_size.yaml b/use-cases/inferencing/inference-scale/hpa_vllm_openai_batch_size.yaml similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/hpa_vllm_openai_batch_size.yaml rename to use-cases/inferencing/inference-scale/hpa_vllm_openai_batch_size.yaml diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/cloud-monitoring-metrics-inference.png b/use-cases/inferencing/inference-scale/images/cloud-monitoring-metrics-inference.png similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/cloud-monitoring-metrics-inference.png rename to use-cases/inferencing/inference-scale/images/cloud-monitoring-metrics-inference.png diff --git a/use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu-metrics.png b/use-cases/inferencing/inference-scale/images/gpu-metrics.png similarity index 100% rename from use-cases/inferencing/serving-with-vllm/manifests/inference-scale/gpu-metrics.png rename to use-cases/inferencing/inference-scale/images/gpu-metrics.png diff --git a/use-cases/inferencing/serving-with-vllm/README.md b/use-cases/inferencing/serving-with-vllm/README.md deleted file mode 100644 index 50d548e6..00000000 --- a/use-cases/inferencing/serving-with-vllm/README.md +++ /dev/null @@ -1,418 +0,0 @@ -# Distributed Inferencing on vLLM - -There are three common strategies for inference on vLLM: - -- Single GPU (no distributed inference) -- Single-Node Multi-GPU (tensor parallel inference) -- Multi-Node Multi-GPU - -In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphical processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework with the above mentioned deployment strategies. You can choose to swap the Gemma model with any other fine-tuned or instruction based model for inference on GKE. - -- Single GPU (no distributed inference) - If your model fits in a single GPU, you probably don’t need to use distributed inference. Just use the single GPU to run the inference. - -- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you need 4 GPUs, you can set the tensor parallel size to 4. 
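To make this concrete, tensor parallelism maps to a single flag on the vLLM OpenAI-compatible server. The following is a minimal sketch; the model path and GPU count are illustrative assumptions, not values from this guide:

```sh
# Minimal sketch: launch the vLLM OpenAI-compatible server with tensor parallelism.
# The model path below is illustrative; set --tensor-parallel-size to the number
# of GPUs available on the node.
python -m vllm.entrypoints.openai.api_server \
  --model /data/models/model-gemma2/experiment \
  --tensor-parallel-size 2 \
  --port 8000
```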
- -By the end of this guide, you should be able to perform the following steps: - -- Create a Persistent Disk for the LLM model weights -- Deploy a vLLM container to your cluster to host your model -- Use vLLM to serve the fine-tuned Gemma model -- View Production metrics for your model serving -- Use custom metrics and Horizontal Pod Autoscaler (HPA) to scale your model - -## Prerequisites - -- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment the scripts and manifest will need to be modified for that environment. -- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md) - -## Preparation - -- Clone the repository and change directory to the guide directory - - ```sh - git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ - cd accelerated-platforms/use-cases/inferencing/serving-with-vllm - ``` - -- Ensure that your `MLP_ENVIRONMENT_FILE` is configured - - ```sh - cat ${MLP_ENVIRONMENT_FILE} && \ - source ${MLP_ENVIRONMENT_FILE} - ``` - - > You should see the various variables populated with the information specific to your environment. - -- Set environment variables - - ```sh - MLP_PROJECT_ID= - PROJECT_NUMBER=$(gcloud projects describe ${PROJECT_ID} --format="value(projectNumber)") - V_MODEL_BUCKET= - MLP_CLUSTER_NAME= - SERVE_NAMESPACE=ml-serve # SERVE_NAMESPACE functions to serving the model - OPS_NAMESPACE=ml-ops # OPS_NAMESPACE functions to download/upload model artifacts and dataset to and from GCS and artifact registry - MODEL_ID= # example : model-gemma2-a100 - MODEL_DIR_PATH ="" # location to copy the model artifacts from - REGION= - ZONE= - ACCELERATOR_TYPE= # nvidia-l4 | nvidia-tesla-a100 - ``` - -- Get Credentials for the GKE cluster - - ```sh - gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID} - ``` - -- Grant permission to kubernetes service account in cluster to access the storage bucket to view model weights - - ```sh - kubectl create ns ${SERVE_NAMESPACE} - kubectl create ns ${OPS_NAMESPACE} - kubectl create sa $KSA -n ${OPS_NAMESPACE} # KSA to download model artifacts from GCS - gcloud storage buckets add-iam-policy-binding "gs://$V_MODEL_BUCKET" \ - --member "principal://iam.googleapis.com/projects/"${PROJECT_NUMBER}"/locations/global/workloadIdentityPools/${PROJECT_ID}.svc.id.goog/subject/ns/$OPS_NAMESPACE/sa/$KSA" \ - --role "roles/storage.objectViewer" - ``` - -- Update the bucket access level to uniform. - - ```sh - gcloud storage buckets update "gs://$V_MODEL_BUCKET" --uniform-bucket-level-access - ``` - -## Create PV, PVC and Persistent disk - -Loading model weights from a Persistent Volume is a method to load models faster. In GKE, Persistent Volumes backed by GCP Persistent Disks can be mounted read-only simultaneously by multiple nodes(ReadOnlyMany), this allows multiple pods access the model weights from a single volume. - - Create a PVC for the model weights - - ```sh - kubectl apply -f manifests/volume-prep/pvc_disk_image.yaml -n ${OPS_NAMESPACE} - ``` - - Create a job downloading the models to the volume and review logs for successful completion. 
- - ```sh - sed -i -e "s|V_KSA|${KSA}|" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - sed -i -e "s|V_MODEL_BUCKET|${MODEL_BUCKET}|" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - sed -i -e "s|V_MODEL_ID|${MODEL_ID}|g" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - sed -i -e "s|V_MODEL_DIR_PATH|${MODEL_DIR_PATH}|" manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml - kubectl create -f manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml -n ${OPS_NAMESPACE} - ``` - - Wait for the job to show completion status. - - ```sh - kubectl get jobs -n ${OPS_NAMESPACE} --watch - ``` - - ``` - NAME STATUS COMPLETIONS DURATION AGE - module-download-job-vl7cc Running 0/1 4m1s 4m1s - module-download-job-vl7cc Running 0/1 4m31s 4m31s - module-download-job-vl7cc Running 0/1 4m35s 4m35s - module-download-job-vl7cc Complete 1/1 4m35s 4m35s - ``` - -You can also check pod logs to check the progress of disk creation. - - ```sh - kubectl logs module-download-job-vl7cc-km29x -n ${OPS_NAMESPACE} - ``` - - ``` - Creating filesystem with 26214400 4k blocks and 6553600 inodes - Filesystem UUID: 8eec47d3-920a-423c-919a-959f016d50cb - Superblock backups stored on blocks: - 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, - 4096000, 7962624, 11239424, 20480000, 23887872 - - Allocating group tables: done - Writing inode tables: done - Creating journal (131072 blocks): done - Writing superblocks and filesystem accounting information: done - - /mnt/model-gemma2-a100: - total 4 - drwxr-xr-x 3 root root 4096 Oct 25 22:00 experiment-a2aa2c3it1 - ``` - - Create the PV and PVC - - ```sh - PV_NAME="$(kubectl get pvc/block-pvc-model -n ${OPS_NAMESPACE} -o jsonpath='{.spec.volumeName}')" - DISK_REF="$(kubectl get pv "$PV_NAME" -n ${OPS_NAMESPACE} -o jsonpath='{.spec.csi.volumeHandle}')" - ``` - - ```sh - gcloud compute images create model-weights-image --source-disk="$DISK_REF" - ``` - - ```sh - gcloud compute disks create models-fine-tune-disk-v1 --size=1TiB --type=pd-ssd --zone=${ZONE} --image=model-weights-image - ``` - - > Note: Choose the appropriate zone based on cluster node location and GPU availability - - ```sh - VOLUME_HANDLE="projects/${MLP_PROJECT_ID}/zones/${ZONE}/disks/models-fine-tune-disk-v1" - sed -i -e "s|V_VOLUME_HANDLE|${VOLUME_HANDLE}|" manifests/volume-prep/persistent_volume.yaml - sed -i -e "s|V_ZONE|${ZONE}|" manifests/volume-prep/persistent_volume.yaml - kubectl apply -f manifests/volume-prep/persistent_volume.yaml # PVs are namespace-less - kubectl apply -f manifests/volume-prep/persistent_volume_claim.yaml -n ${SERVE_NAMESPACE} # Deploy PVC claim where you serve the model - ``` - -## Deploy a vLLM container to your cluster - -- Run the batch job to deploy model using persistent disk on GKE. 
- There are few variables that need to be updated before you can run this job to download model on your persistent disk - - - Here is an example - - ```sh - mkdir -p /mnt/model-gemma2-a100 - cp -r /data/models/model-gemma2-a100/experiment-a2aa2c3it1 /mnt/model-gemma2-a100 # location to copy the model artifacts from - ls -lR /mnt/model-gemma2-a100 # location to copy the model artifacts - ``` - - ```sh - sed -i -e "s|V_MODEL_ID|${MODEL_ID}|" manifests/model_deployment.yaml - sed -i -e "s|V_MODEL_DIR_PATH|${MODEL_DIR_PATH}|" manifests/model_deployment.yaml - sed -i -e "s|V_ACCELERATOR_TYPE|${ACCELERATOR_TYPE}|" manifests/model_deployment.yaml - ``` - - ```sh - kubectl create -f manifests/model_deployment.yaml -n ${SERVE_NAMESPACE} - kubectl logs -f -l app=vllm-openai -n ${SERVE_NAMESPACE} - ``` - - ```sh - INFO: Started server process [1] - INFO: Waiting for application startup. - INFO: Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) - ``` - -## Serve the deployed model through curl and a web chat interface - -- Test your deployed model through the CLI - - ```sh - kubectl port-forward svc/vllm-openai -n ${SERVE_NAMESPACE} 8000 - ``` - - Run the curl prompt with your values - - ```sh - USER_PROMPT="I'm looking for comfortable cycling shorts for women, what are some good options?" - MODEL_ID="" - ``` - - ``` - curl http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "${MODEL_ID}", - "messages": [ - {"role": "user", "content": "${USER_PROMPT}"}], - "temperature": 0.70, - "top_p": 1.0, - "top_k": 1.0, - "max_tokens": 256 - }' - ``` - -- You can also deploy a gradio chat interface to view the model chat interface. [OPTIONAL] - - ```sh - sed -i -e "s|V_MODEL_ID|${MODEL_ID}|" manifests/gradio.yaml - ``` - - ```sh - kubectl apply -f manifests/gradio.yaml -n ${SERVE_NAMESPACE} - ``` - -### Production Metrics - -vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. - - ```sh - curl http://vllm-openai:8000/metrics - ``` - -### View vLLM serving metrics for your model on GKE - -You can configure monitoring of the metrics above using the [pod monitoring](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed#gmp-pod-monitoring) - - ```sh - kubectl apply -f manifests/pod_monitoring.yaml -n ${SERVE_NAMESPACE} - ``` - -### Create a dashboard for Cloud Monitoring to view vLLM metrics - -Cloud Monitoring provides an [importer](https://cloud.google.com/monitoring/dashboards/import-grafana-dashboards) that you can use to import dashboard files in the Grafana JSON format into Cloud Monitoring - - Clone github repository - - ```sh - git clone https://github.com/GoogleCloudPlatform/monitoring-dashboard-samples - ``` - - Change to the directory for the dashboard importer: - - ```sh - cd monitoring-dashboard-samples/scripts/dashboard-importer - ``` - -The dashboard importer includes the following scripts: - -- import.sh, which converts dashboards and optionally uploads the converted dashboards to Cloud Monitoring. -- upload.sh, which uploads the converted dashboards—or any Monitoring dashboards—to Cloud Monitoring. The import.sh script calls this script to do the upload. - - Import the dashboard - - ```sh - ./import.sh ./configs/grafana.json ${MLP_PROJECT_ID} - ``` - - When you use the import.sh script, you must specify the location of the Grafana dashboards to convert. 
The importer creates a directory that contains the converted dashboards and other information. - - -### Run Batch inference on GKE - -Once a model has completed fine-tuning and is deployed on GKE , its ready to run batch Inference pipeline. -In this example batch inference pipeline, we would first send prompts to the hosted fine-tuned model and then validate the results based on ground truth. - -Please follow ```use-cases/inferencing/serving-with-vllm/batch-inference/README.md``` for instructions. - -### Run benchmarks for inference - -Please follow ```use-cases/inferencing/serving-with-vllm/benchmarks/README.md``` for instructions. - -### Inference at Scale - -There are different metrics available that could be used to scale your inference workloads on GKE. - -Server metrics: LLM inference servers vLLM provides workload-specific performance metrics. GKE simplifies scraping and autoscaling of workloads based on these server-level metrics. You can use these metrics to gain visibility into performance indicators like batch size, queue size, and decode latencies - -In case of vLLM, [production metrics class](https://docs.vllm.ai/en/latest/serving/metrics.html) exposes a number of useful metrics which GKE can use to horizontally scale inference workloads. - -```sh -vllm:num_requests_running - Number of requests currently running on GPU. -vllm:num_requests_waiting - Number of requests waiting to be processed -``` - -GPU metrics: - -```none -GPU Utilization (DCGM_FI_DEV_GPU_UTIL) - Measures the duty cycle, which is the amount of time that the GPU is active. -GPU Memory Usage (DCGM_FI_DEV_FB_USED) - Measures how much GPU memory is being used at a given point in time. This is useful for workloads that implement dynamic allocation of GPU memory. -``` - -``` -sh -kubectl apply -f manifests/inference-scale/hpa_gpu-metrics.yaml -n ${SERVE_NAMESPACE} -``` - -Here is a sample metrics graph that represent the bGPU metrics for duty cycle. ![metrics graph](./manifests/inference-scale/gpu-metrics.png) to review. - -CPU metrics: Since the inference workloads primarily rely on GPU resources, we don't recommend CPU and memory utilization as the only indicators of the amount of resources a job consumes. Therefore, using CPU metrics alone for autoscaling can lead to suboptimal performance and costs. - -HPA is an efficient way to ensure that your model servers scale appropriately with load. Fine-tuning the HPA settings is the primary way to align your provisioned hardware cost with traffic demands to achieve your inference server performance goals. - -We recommend setting these HPA configuration options: - -- Stabilization window: Use this HPA configuration option to prevent rapid replica count changes due to fluctuating metrics. Defaults are 5 minutes for scale-down (avoiding premature downscaling) and 0 for scale-up (ensuring responsiveness). Adjust the value based on your workload's volatility and your preferred responsiveness. - -- Scaling policies: Use this HPA configuration option to fine-tune the scale-up and scale-down behavior. You can set the "Pods" policy limit to specify the absolute number of replicas changed per time unit, and the "Percent" policy limit to specify by the percentage change. - -For more details, see Horizontal pod autoscaling in the Google Cloud Managed Service for Prometheus [documentation](https://cloud.google.com/kubernetes-engine/docs/horizontal-pod-autoscaling). - -Pre-requisites: - -- GKE cluster running inference workload as shown in previous examples. 
-- Export the metrics from the vLLM server to Cloud Monitoring as shown in enable monitoring section. - -We have couple of options to scale the inference workload on GKE using the HPA and custom metrics adapter. - -- Scale pod on the same node as the existing inference workload. -- Scale pod on the other nodes in the same node pool as the existing inference workload. - -#### Prepare your environment to autoscale with HPA metrics - -Install the Custom Metrics Adapter. This adapter makes the custom metric that you exported to Cloud Monitoring visible to the HPA. For more details, see HPA in the [Google Cloud Managed Service for Prometheus documentation](https://cloud.google.com/stackdriver/docs/managed-prometheus/hpa). - -The following example command shows how to install the adapter: - - ```sh - kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml - ``` - -Set up the custom metric-based HPA resource. Deploy an HPA resource that is based on your preferred custom metric. - -Here is a sample metrics graph that represent the batch size. ![metrics graph](./manifests/inference-scale/cloud-monitoring-metrics-inference.png) to review. - - -- Batch-size - - ```sh - AVERAGE_VALUE = 10 # Replace it with a value of choice - sed -i -e "s|V_AVERAGE_VALUE|${AVERAGE_VALUE}|" manifests/inference-scale/hpa_vllm_openai_batch_size.yaml - kubectl apply -f manifests/inference-scale/hpa_vllm_openai_batch_size.yaml -n ${SERVE_NAMESPACE} - ``` - -> Note: Below is an example of the batch size HPA scale test below: - -```sh -kubectl get hpa vllm-openai-hpa -n ${SERVE_NAMESPACE} --watch -NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE -vllm-openai-hpa Deployment/vllm-openai 0/10 1 5 1 6d16h -vllm-openai-hpa Deployment/vllm-openai 13/10 1 5 1 6d16h -vllm-openai-hpa Deployment/vllm-openai 17/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 12/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 17/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 14/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 17/10 1 5 2 6d16h -vllm-openai-hpa Deployment/vllm-openai 10/10 1 5 2 6d16h -``` - -```sh -kubectl get pods -n ${SERVE_NAMESPACE} --watch -NAME READY STATUS RESTARTS AGE -gradio-6b8698d7b4-88zm7 1/1 Running 0 10d -model-eval-2sxg2 0/1 Completed 0 8d -vllm-openai-767b477b77-2jm4v 1/1 Running 0 3d17h -vllm-openai-767b477b77-82l8v 0/1 Pending 0 9s -``` - -Pod scaled up -```sh -kubectl get pods -n ${SERVE_NAMESPACE} --watch -NAME READY STATUS RESTARTS AGE -gradio-6b8698d7b4-88zm7 1/1 Running 0 10d -model-eval-2sxg2 0/1 Completed 0 8d -vllm-openai-767b477b77-2jm4v 1/1 Running 0 3d17h -vllm-openai-767b477b77-82l8v 1/1 Running 0 111s -``` - -The new pod is deployed on a node triggered by the cluster autoscaler. -> NOTE: The existing node where inference workload was deployed in this case had only two GPUS. Hence a new node is required to deploy the copy pod of inference workload. - -```sh -kubectl describe pods vllm-openai-767b477b77-82l8v -n ${SERVE_NAMESPACE} - -Events: - Type Reason Age From Message - ---- ------ ---- ---- ------- - Warning FailedScheduling 4m15s gke.io/optimize-utilization-scheduler 0/3 nodes are available: 1 Insufficient ephemeral-storage, 1 Insufficient nvidia.com/gpu, 2 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 1 No preemption victims found for incoming pod, 2 Preemption is not helpful for scheduling. 
- Normal TriggeredScaleUp 4m13s cluster-autoscaler pod triggered scale-up: [{https://www.googleapis.com/compute/v1/projects/gkebatchexpce3c8dcb/zones/us-east4-a/instanceGroups/gke-kh-e2e-l4-2-c399c5c0-grp 1->2 (max: 20)}] - Normal Scheduled 2m40s gke.io/optimize-utilization-scheduler Successfully assigned ml-serve/vllm-openai-767b477b77-82l8v to gke-kh-e2e-l4-2-c399c5c0-vvm9 - Normal SuccessfulAttachVolume 2m36s attachdetach-controller AttachVolume.Attach succeeded for volume "model-weights-disk-1024gb-zone-a" - Normal Pulling 2m29s kubelet Pulling image "vllm/vllm-openai:v0.5.3.post1" - Normal Pulled 2m25s kubelet Successfully pulled image "vllm/vllm-openai:v0.5.3.post1" in 4.546s (4.546s including waiting). Image size: 5586843591 bytes. - Normal Created 2m25s kubelet Created container inference-server - Normal Started 2m25s kubelet Started container inference-server -``` diff --git a/use-cases/inferencing/serving/vllm/gcsfuse/README.md b/use-cases/inferencing/serving/vllm/gcsfuse/README.md new file mode 100644 index 00000000..1c364962 --- /dev/null +++ b/use-cases/inferencing/serving/vllm/gcsfuse/README.md @@ -0,0 +1,177 @@ +# Distributed Inferencing on vLLM + +There are three common strategies for inference on vLLM: + +- Single GPU (no distributed inference) +- Single-Node Multi-GPU (tensor parallel inference) +- Multi-Node Multi-GPU + +In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphical processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework with the above mentioned deployment strategies. You can choose to swap the Gemma model with any other fine-tuned or instruction based model for inference on GKE. + +- Single GPU (no distributed inference) - If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you need 4 GPUs, you can set the tensor parallel size to 4. + +By the end of this guide, you should be able to perform the following steps: + +- Deploy a vLLM container to your cluster to host your model +- Use vLLM to serve the fine-tuned Gemma model +- View Production metrics for your model serving +- Use custom metrics and Horizontal Pod Autoscaler (HPA) to scale your model + +## Prerequisites + +- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment the scripts and manifest will need to be modified for that environment. +- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md) + +## Preparation + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms + ``` + +- Change directory to the guide directory + + ```sh + cd use-cases/inferencing/serving/vllm/gcsfuse + ``` + +- Ensure that your `MLP_ENVIRONMENT_FILE` is configured + + ```sh + cat ${MLP_ENVIRONMENT_FILE} && \ + source ${MLP_ENVIRONMENT_FILE} + ``` + + > You should see the various variables populated with the information specific to your environment. 
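As an optional sanity check before continuing, you can confirm that the variables this guide relies on were populated when you sourced the environment file. This is a sketch that assumes a bash shell; the variable names are taken from the commands used later in this guide:

```sh
# Sketch: verify the environment variables used later in this guide are set.
# Uses bash indirect expansion; adjust if your environment file differs.
for var in MLP_PROJECT_ID MLP_PROJECT_NUMBER MLP_CLUSTER_NAME MLP_MODEL_BUCKET; do
  if [ -n "${!var}" ]; then
    echo "${var}=${!var}"
  else
    echo "WARNING: ${var} is not set"
  fi
done
```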
+ +- Configure the environment + + | Variable | Description | Example | + | --------------- | ---------------------------------------- | ------------ | + | SERVE_KSA | The Kubernetes service account | ml-serve-gcs | + | SERVE_NAMESPACE | Namespace where the model will be served | ml-serve | + + ```sh + SERVE_KSA=ml-serve-gcs + SERVE_NAMESPACE=ml-serve + ``` + +- Get Credentials for the GKE cluster + + ```sh + gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID} + ``` + +- Create the namespace + + ```sh + kubectl create ns ${SERVE_NAMESPACE} + kubectl create sa ${SERVE_KSA} -n ${SERVE_NAMESPACE} + gcloud storage buckets add-iam-policy-binding "gs://${MLP_MODEL_BUCKET}" \ + --member "principal://iam.googleapis.com/projects/${MLP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${MLP_PROJECT_ID}.svc.id.goog/subject/ns/${SERVE_NAMESPACE}/sa/${SERVE_KSA}" \ + --role "roles/storage.objectViewer" + ``` + +## Prepare the Persistent Disk (PD) + +Loading model weights from a PersistentVolume is a method to load models faster. In GKE, PersistentVolumes backed by Google Cloud Persistent Disks can be mounted read-only simultaneously by multiple nodes (ReadOnlyMany), this allows multiple pods access to the model weights from a single volume. + +- Configure the environment + + | Variable | Description | Example | + | ------------- | -------------------------------------------------------------------------------------------- | ------------- | + | ACCELERATOR | Type of GPU accelerator to use (l4, a100, h100) | l4 | + | MODEL_NAME | The name of the model folder in the root of the GCS model bucket | model-gemma2 | + | MODEL_VERSION | The name of the version folder inside the model folder of the GCS model bucket | experiment | + | ZONE | GCP zone where you have accelerators available. The zone must be in the region ${MLP_REGION} | us-central1-a | + + ```sh + ACCELERATOR=l4 + MODEL_NAME=model-gemma2 + MODEL_VERSION=experiment + ZONE=us-central1-a + ``` + +## Serve the model with vLLM + +- Configure the deployment + + ``` + VLLM_IMAGE_NAME="vllm/vllm-openai:v0.6.3.post1" + ``` + + ```sh + sed \ + -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \ + -i -e "s|V_IMAGE_NAME|${VLLM_IMAGE_NAME}|" \ + -i -e "s|V_KSA|${SERVE_KSA}|" \ + manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Create the deployment + + ``` + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Wait for the deployment to be ready + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} wait --for=condition=ready --timeout=900s pod --selector app=vllm-openai-gcs-${ACCELERATOR} + ``` + +## Serve the model through a web chat interface + +- Configure the deployment + + ```sh + sed \ + -i -e "s|V_ACCELERATOR|${ACCELERATOR}|g" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|g" \ + manifests/gradio.yaml + ``` + +- Create the deployment + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/gradio.yaml + ``` + +- Verify the deployment is ready + +- Access the chat interface + + ```sh + echo -e "\nGradio chat interface: ${MLP_GRADIO_NAMESPACE_ENDPOINT}\n" + ``` + +- Enter the following prompt in the chat text box to get the response from the model. + + ``` + I'm looking for comfortable cycling shorts for women, what are some good options? 
+ ``` + +## Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the system. For more information about accessing these metrics see [vLLM Metrics](/use-cases/inferencing/serving/vllm/metrics/README.md). + +### Run Batch inference on GKE + +Once a model has completed fine-tuning and is deployed on GKE , you can run batch inference on it. Follow the instructions in [batch-inference readme](/use-cases/inferencing/batch-inference/README.md) to run batch inference. + +### Run benchmarks for inference + +The model is ready to run the benchmarks for inference job. Follow [benchmark readme](/use-cases/inferencing/benchmarks/README.md) to run inference benchmarks on our model. + +### Inference at Scale + +You can configure Horizontal Pod Autoscaler to scale your inference deployment based +on relevant metrics. Follow the instructions on +[inference at scale reademe](./inference-scale/README.md) to scale your +deployed model. diff --git a/use-cases/inferencing/serving-with-vllm/manifests/gradio.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/gradio.yaml similarity index 70% rename from use-cases/inferencing/serving-with-vllm/manifests/gradio.yaml rename to use-cases/inferencing/serving/vllm/gcsfuse/manifests/gradio.yaml index f2926cca..8462fae4 100644 --- a/use-cases/inferencing/serving-with-vllm/manifests/gradio.yaml +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/gradio.yaml @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +--- apiVersion: apps/v1 kind: Deployment metadata: - name: gradio labels: app: gradio + name: gradio spec: replicas: 1 selector: @@ -29,42 +29,46 @@ spec: app: gradio spec: containers: - - name: gradio - image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.3 - resources: - requests: - cpu: "250m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "512Mi" - env: + - env: - name: CONTEXT_PATH - value: "/v1/chat/completions" + value: /v1/chat/completions - name: HOST - value: "http://vllm-openai:8000" + value: http://vllm-openai-gcs-V_ACCELERATOR:8000 - name: DISABLE_SYSTEM_MESSAGE - value: "true" + value: 'true' - name: LLM_ENGINE - value: "openai" + value: openai - name: MODEL_ID - value: V_MODEL_ID + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION - name: USER_PROMPT - value: "user\nprompt\n" + value: | + user + prompt - name: SYSTEM_PROMPT - value: "model\nprompt\n" + value: | + model + prompt + image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.3 + name: gradio ports: - containerPort: 7860 + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 512Mi --- apiVersion: v1 kind: Service metadata: name: gradio spec: - selector: - app: gradio ports: - - protocol: TCP - port: 8080 + - port: 8080 + protocol: TCP targetPort: 7860 - type: LoadBalancer + selector: + app: gradio + type: LoadBalancer \ No newline at end of file diff --git a/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml new file mode 100644 index 00000000..6f4adcce --- /dev/null +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-a100.yaml @@ -0,0 +1,107 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-openai-gcs-a100 +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-openai-gcs-a100 + template: + metadata: + labels: + app: vllm-openai-gcs-a100 + annotations: + gke-gcsfuse/volumes: "true" + spec: + containers: + - name: inference-server + args: + - --model=$(MODEL) + - --tensor-parallel-size=2 + env: + - name: MODEL + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION + - name: VLLM_ATTENTION_BACKEND + value: FLASHINFER + image: V_IMAGE_NAME + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 240 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + limits: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: gcs-fuse-csi-ephemeral + mountPath: /gcs + readOnly: true + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-tesla-a100 + serviceAccountName: V_KSA + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "on-demand" + value: "true" + operator: "Equal" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: V_MODEL_BUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1" + fileCacheCapacity: "20Gi" + fileCacheForRangeRead: "true" + metadataStatCacheCapacity: "-1" + metadataTypeCacheCapacity: "-1" + metadataCacheTTLSeconds: "-1" +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-openai-gcs-a100 +spec: + selector: + app: vllm-openai-gcs-a100 + type: ClusterIP + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-h100.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-h100.yaml new file mode 100644 index 00000000..786917db --- /dev/null +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-h100.yaml @@ -0,0 +1,107 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-openai-gcs-h100 +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-openai-gcs-h100 + template: + metadata: + labels: + app: vllm-openai-gcs-h100 + annotations: + gke-gcsfuse/volumes: "true" + spec: + containers: + - name: inference-server + args: + - --model=$(MODEL) + - --tensor-parallel-size=2 + env: + - name: MODEL + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION + - name: VLLM_ATTENTION_BACKEND + value: FLASHINFER + image: V_IMAGE_NAME + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 240 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + limits: + cpu: "2" + memory: "25Gi" + ephemeral-storage: "25Gi" + nvidia.com/gpu: "2" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: gcs-fuse-csi-ephemeral + mountPath: /gcs + readOnly: true + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-h100-80gb + serviceAccountName: V_KSA + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "on-demand" + value: "true" + operator: "Equal" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: V_MODEL_BUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1" + fileCacheCapacity: "20Gi" + fileCacheForRangeRead: "true" + metadataStatCacheCapacity: "-1" + metadataTypeCacheCapacity: "-1" + metadataCacheTTLSeconds: "-1" +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-openai-gcs-h100 +spec: + selector: + app: vllm-openai-gcs-h100 + type: ClusterIP + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/use-cases/inferencing/serving-with-vllm/manifests/model_deployment.yaml b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-l4.yaml similarity index 58% rename from use-cases/inferencing/serving-with-vllm/manifests/model_deployment.yaml rename to use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-l4.yaml index a56206e7..326b1c22 100644 --- a/use-cases/inferencing/serving-with-vllm/manifests/model_deployment.yaml +++ b/use-cases/inferencing/serving/vllm/gcsfuse/manifests/model-deployment-l4.yaml @@ -15,21 +15,40 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: vllm-openai + name: vllm-openai-gcs-l4 spec: replicas: 1 selector: matchLabels: - app: vllm-openai + app: vllm-openai-gcs-l4 template: metadata: labels: - app: vllm-openai + app: vllm-openai-gcs-l4 + annotations: + gke-gcsfuse/volumes: "true" spec: - serviceAccountName: default containers: - name: inference-server - image: vllm/vllm-openai:v0.5.3.post1 + args: + - --model=$(MODEL) + - --tensor-parallel-size=2 + env: + - name: MODEL + value: /gcs/V_MODEL_NAME/V_MODEL_VERSION + - name: VLLM_ATTENTION_BACKEND + value: FLASHINFER + image: V_IMAGE_NAME + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 240 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 resources: requests: cpu: "2" @@ -41,47 +60,46 @@ spec: memory: "25Gi" ephemeral-storage: "25Gi" nvidia.com/gpu: "2" - args: - - --model=$(MODEL) - - --tensor-parallel-size=2 - env: - - name: MODEL - value: /data/models/V_MODEL_ID/V_MODEL_DIR_PATH - - name: 
VLLM_ATTENTION_BACKEND - value: FLASHINFER volumeMounts: - mountPath: /dev/shm name: dshm - - name: model-disk - mountPath: /data/models - readOnly: true - volumes: - - name: dshm - emptyDir: - medium: Memory - - name: model-disk - persistentVolumeClaim: - claimName: pvc-model-weights-claim-1024gb-zone-a-ro + - name: gcs-fuse-csi-ephemeral + mountPath: /gcs readOnly: true + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 + serviceAccountName: V_KSA tolerations: - key: "nvidia.com/gpu" - operator: "Equal" - value: "present" + operator: "Exists" effect: "NoSchedule" - key: "on-demand" - operator: "Equal" value: "true" - effect: "NoSchedule" - nodeSelector: - cloud.google.com/gke-accelerator: V_ACCELERATOR_TYPE + operator: "Equal" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: V_MODEL_BUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1" + fileCacheCapacity: "20Gi" + fileCacheForRangeRead: "true" + metadataStatCacheCapacity: "-1" + metadataTypeCacheCapacity: "-1" + metadataCacheTTLSeconds: "-1" --- apiVersion: v1 kind: Service metadata: - name: vllm-openai + name: vllm-openai-gcs-l4 spec: selector: - app: vllm-openai + app: vllm-openai-gcs-l4 type: ClusterIP ports: - protocol: TCP diff --git a/use-cases/inferencing/serving/vllm/metrics/README.md b/use-cases/inferencing/serving/vllm/metrics/README.md new file mode 100644 index 00000000..f0499252 --- /dev/null +++ b/use-cases/inferencing/serving/vllm/metrics/README.md @@ -0,0 +1,109 @@ +# vLLM Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. These metrics can be scraped using Google Managed Prometheus (GMP) and made available in [Cloud Metrics](https://console.cloud.google.com/monitoring/metrics-explorer). For more details, see [pod monitoring with Google managed prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed#gmp-pod-monitoring). + +## Prerequisites + +- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). If you are using a different environment the scripts and manifest will need to be modified for that environment. 
+- A model is deployed using one of the vLLM guides + - [Serving the model using vLLM and GCSFuse](/use-cases/inferencing/serving/vllm/gcsfuse/README.md) + - [Serving the model using vLLM and Persistent Disk](/use-cases/inferencing/serving/vllm/persistent-disk/README.md) + +## Preparation + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms + ``` + +- Change directory to the guide directory + + ```sh + cd use-cases/inferencing/serving/vllm/metrics + METRICS_DIR=$(pwd) + ``` + +- Ensure that your `MLP_ENVIRONMENT_FILE` is configured + + ```sh + cat ${MLP_ENVIRONMENT_FILE} && \ + source ${MLP_ENVIRONMENT_FILE} + ``` + +## Deploy the PodMonitoring resource + +- Configure the environment + + | Variable | Description | Example | + | --------------- | --------------------------------------------- | -------- | + | ACCELERATOR | Type of GPU accelerator used (l4, a100, h100) | l4 | + | MODEL_STORAGE | Type of storage used for the model (gcs, pd) | pd | + | SERVE_NAMESPACE | Namespace where the model will be served | ml-serve | + + ```sh + ACCELERATOR=l4 + MODEL_STORAGE=pd + SERVE_NAMESPACE=ml-serve + ``` + +- Configure the resource + + ```sh + sed \ + -i -e "s|V_ACCELERATOR|${ACCELERATOR}|" \ + -i -e "s|V_MODEL_STORAGE|${MODEL_STORAGE}|" \ + manifests/pod-monitoring.yaml + ``` + +- Create the resource + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/pod-monitoring.yaml + ``` + +## View the metrics + +- Make several requests to your model to populate metrics + +- Wait a minute for the metrics to populate, then you can view the metrics in the Metrics explorer + - Go to the [Metrics explorer](https://console.cloud.google.com/monitoring/metrics-explorer) + - Click the **Select a metric** dropdown near the upper left of the screen + - Select **Prometheus Target** + - Select **Vllm**, you should now see a list of the available metrics. + - Select **Prometheus/vllm:avg_generation_throughput_toks_per_s/gauge** + - Click **Apply** + - Click **Add filter** in the **Filter** text box + - Under **Resource labels** select **cluster** + - For the **value** select the name of your cluster + - You should now see the metrics for your cluster + +## Create a dashboard + +Cloud Monitoring provides an [importer](https://cloud.google.com/monitoring/dashboards/import-grafana-dashboards) that you can use to import dashboard files in the Grafana JSON format into Cloud Monitoring. + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/monitoring-dashboard-samples + ``` + +- Change to the directory for the dashboard importer: + + ```sh + cd monitoring-dashboard-samples/scripts/dashboard-importer + ``` + +The dashboard importer includes the following scripts: + +- `import.sh`, which converts dashboards and optionally uploads the converted dashboards to Cloud Monitoring. +- `upload.sh`, which uploads the converted dashboards or any Monitoring dashboards to Cloud Monitoring. The `import.sh` script calls this script to do the upload.
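Before importing the dashboard, you can optionally confirm that vLLM metrics are being emitted. This is a minimal sketch that assumes the service naming used in the serving guides (`vllm-openai-<storage>-<accelerator>` listening on port 8000):

```sh
# Sketch: check the raw Prometheus metrics exposed by the vLLM server before
# wiring up dashboards. The service name is assembled from the MODEL_STORAGE and
# ACCELERATOR values configured above; adjust it if your service is named differently.
kubectl --namespace ${SERVE_NAMESPACE} port-forward "svc/vllm-openai-${MODEL_STORAGE}-${ACCELERATOR}" 8000:8000 &
sleep 5
curl -s http://localhost:8000/metrics | grep "^vllm:" | head
kill %1
```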
+ +- Import the dashboard + + ```sh + ./import.sh ${METRICS_DIR}/grafana/vllm.json ${MLP_PROJECT_ID} + ``` + +- A link to the dashboard will be output by the script, open the link to view the dashboard diff --git a/use-cases/inferencing/serving-with-vllm/configs/grafana.json b/use-cases/inferencing/serving/vllm/metrics/grafana/vllm.json similarity index 100% rename from use-cases/inferencing/serving-with-vllm/configs/grafana.json rename to use-cases/inferencing/serving/vllm/metrics/grafana/vllm.json diff --git a/use-cases/inferencing/serving-with-vllm/manifests/pod_monitoring.yaml b/use-cases/inferencing/serving/vllm/metrics/manifests/pod-monitoring.yaml similarity index 82% rename from use-cases/inferencing/serving-with-vllm/manifests/pod_monitoring.yaml rename to use-cases/inferencing/serving/vllm/metrics/manifests/pod-monitoring.yaml index 93b233b5..d3d2c932 100644 --- a/use-cases/inferencing/serving-with-vllm/manifests/pod_monitoring.yaml +++ b/use-cases/inferencing/serving/vllm/metrics/manifests/pod-monitoring.yaml @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +--- apiVersion: monitoring.googleapis.com/v1 kind: PodMonitoring metadata: - name: vllm-inference + name: vllm-inference-V_MODEL_STORAGE-V_ACCELERATOR labels: - app: vllm-openai + app: vllm-inference-V_MODEL_STORAGE-V_ACCELERATOR spec: selector: matchLabels: - app: vllm-openai + app: vllm-openai-V_MODEL_STORAGE-V_ACCELERATOR endpoints: - port: 8000 path: /metrics diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/README.md b/use-cases/inferencing/serving/vllm/persistent-disk/README.md new file mode 100644 index 00000000..b766992f --- /dev/null +++ b/use-cases/inferencing/serving/vllm/persistent-disk/README.md @@ -0,0 +1,349 @@ +# Distributed Inferencing on vLLM + +There are three common strategies for inference on vLLM: + +- Single GPU (no distributed inference) +- Single-Node Multi-GPU (tensor parallel inference) +- Multi-Node Multi-GPU + +In this guide, you will serve a fine-tuned Gemma large language model (LLM) using graphical processing units (GPUs) on Google Kubernetes Engine (GKE) with the vLLM serving framework with the above mentioned deployment strategies. You can choose to swap the Gemma model with any other fine-tuned or instruction based model for inference on GKE. + +- Single GPU (no distributed inference) - If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- Single-Node Multi-GPU (tensor parallel inference) - If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you need 4 GPUs, you can set the tensor parallel size to 4. + +By the end of this guide, you should be able to perform the following steps: + +- Create a Persistent Disk for the LLM model weights +- Deploy a vLLM container to your cluster to host your model +- Use vLLM to serve the fine-tuned Gemma model +- View Production metrics for your model serving +- Use custom metrics and Horizontal Pod Autoscaler (HPA) to scale your model + +## Prerequisites + +- This guide was developed to be run on the [playground AI/ML platform](/platforms/gke-aiml/playground/README.md). 
If you are using a different environment the scripts and manifest will need to be modified for that environment. +- A bucket containing the fine-tuned model from the [Fine-tuning example](/use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/README.md) + +## Preparation + +- Clone the repository + + ```sh + git clone https://github.com/GoogleCloudPlatform/accelerated-platforms && \ + cd accelerated-platforms + ``` + +- Change directory to the guide directory + + ```sh + cd use-cases/inferencing/serving/vllm/persistent-disk + ``` + +- Ensure that your `MLP_ENVIRONMENT_FILE` is configured + + ```sh + cat ${MLP_ENVIRONMENT_FILE} && \ + source ${MLP_ENVIRONMENT_FILE} + ``` + + > You should see the various variables populated with the information specific to your environment. + +- Configure the environment + + | Variable | Description | Example | + | --------------- | ------------------------------------------------------------------------------------------------- | ----------- | + | OPS_KSA | Kubernetes service account used for operations | ml-ops | + | OPS_NAMESPACE | Namespace where download/upload model artifacts and dataset to and from GCS and artifact registry | ml-ops | + | SERVE_KSA | The Kubernetes service account | ml-serve-pd | + | SERVE_NAMESPACE | Namespace where the model will be served | ml-serve | + + ```sh + OPS_KSA=ml-ops + OPS_NAMESPACE=ml-ops + SERVE_KSA=ml-serve-pd + SERVE_NAMESPACE=ml-serve + ``` + +- Get Credentials for the GKE cluster + + ```sh + gcloud container fleet memberships get-credentials ${MLP_CLUSTER_NAME} --project ${MLP_PROJECT_ID} + ``` + +- Grant permission to kubernetes service account in cluster to access the storage bucket to view model weights + + ```sh + kubectl create ns ${OPS_NAMESPACE} + kubectl create sa ${OPS_KSA} -n ${OPS_NAMESPACE} + gcloud storage buckets add-iam-policy-binding "gs://${MLP_MODEL_BUCKET}" \ + --member "principal://iam.googleapis.com/projects/${MLP_PROJECT_NUMBER}/locations/global/workloadIdentityPools/${MLP_PROJECT_ID}.svc.id.goog/subject/ns/${OPS_NAMESPACE}/sa/${OPS_KSA}" \ + --role "roles/storage.objectViewer" + + kubectl create ns ${SERVE_NAMESPACE} + kubectl create sa ${SERVE_KSA} -n ${SERVE_NAMESPACE} + ``` + +## Prepare the Persistent Disk (PD) + +Loading model weights from a PersistentVolume is a method to load models faster. In GKE, PersistentVolumes backed by Google Cloud Persistent Disks can be mounted read-only simultaneously by multiple nodes (ReadOnlyMany), this allows multiple pods access to the model weights from a single volume. + +- Configure the environment + + | Variable | Description | Example | + | -------------- | -------------------------------------------------------------------------------------------- | -------------------------- | + | ACCELERATOR | Type of GPU accelerator to use (l4, a100, h100) | l4 | + | GCE_DISK_NAME | Name of the persistent disk that will host the model | vllm-model-weights-${ZONE} | + | GCE_IMAGE_NAME | Disk image created with model weights | vllm-model-weights-${ZONE} | + | MODEL_NAME | The name of the model folder in the root of the GCS model bucket | model-gemma2 | + | MODEL_VERSION | The name of the version folder inside the model folder of the GCS model bucket | experiment | + | ZONE | GCP zone where you have accelerators available. 
The zone must be in the region ${MLP_REGION} | us-central1-a | + + ```sh + ACCELERATOR=l4 + MODEL_NAME=model-gemma2 + MODEL_VERSION=experiment + ZONE=us-central1-a + ``` + + ```ssh + GCE_DISK_NAME=vllm-model-weights-${ZONE} # TODO: make this unique across environments + GCE_IMAGE_NAME=vllm-model-weights-${ZONE} # TODO: make this unique across environments + ``` + +### Download the model from GCS to a PersistentVolume (PV) + +- Create a PersistentVolumeClaim (PVC) for the model weights + + ```sh + kubectl --namespace ${OPS_NAMESPACE} apply -f manifests/volume-prep/pvc_disk_image.yaml + ``` + +- Configure the job to download the model from the GCS bucket to the PersistentVolume (PV) + + ```sh + sed \ + -i -e "s|V_KSA|${OPS_KSA}|" \ + -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \ + manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml + ``` + +- Create the job. + + ``` + kubectl --namespace ${OPS_NAMESPACE} create -f manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml + ``` + +- Once the job has started, you can check the pod logs for the progress of the download + + ```sh + POD=$(kubectl --namespace ${OPS_NAMESPACE} get pods --no-headers --output custom-columns=":metadata.name" --selector app=model-downloader) + kubectl --namespace ${OPS_NAMESPACE} logs pod/${POD} + ``` + + If the download is still in progress you should see something similar to: + + ``` + ...... + + Allocating group tables: done + Writing inode tables: done + Creating journal (###### blocks): done + Writing superblocks and filesystem accounting information: done + + ``` + + If the download is complete you should see something similar to: + + ``` + ...... + + Allocating group tables: done + Writing inode tables: done + Creating journal (###### blocks): done + Writing superblocks and filesystem accounting information: done + + + total ##K + drwxr-xr-x 3 root root 4.0K MMM DD HH:MM . + drwxr-xr-x 4 root root 4.0K MMM DD HH:MM .. + drwxr-xr-x 3 root root 4.0K MMM DD HH:MM experiment + total ##G + drwxr-xr-x 3 root root #### MMM DD HH:MM . + drwxr-xr-x 3 root root #### MMM DD HH:MM .. + -rw-r--r-- 1 root root #### MMM DD HH:MM README.md + drwxr-xr-x 4 root root #### MMM DD HH:MM checkpoint-##### + -rw-r--r-- 1 root root #### MMM DD HH:MM config.json + -rw-r--r-- 1 root root #### MMM DD HH:MM generation_config.json + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00001-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00002-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00003-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model-00004-of-00004.safetensors + -rw-r--r-- 1 root root #### MMM DD HH:MM model.safetensors.index.json + -rw-r--r-- 1 root root #### MMM DD HH:MM special_tokens_map.json + -rw-r--r-- 1 root root #### MMM DD HH:MM tokenizer.json + -rw-r--r-- 1 root root #### MMM DD HH:MM tokenizer_config.json + ``` + +- Wait for the job to complete + + ```sh + kubectl wait --namespace=${OPS_NAMESPACE} --for=condition=complete --timeout=900s job/model-downloader && echo "complete" & + kubectl wait --namespace=${OPS_NAMESPACE} --for=condition=failed --timeout=900s job/model-downloader && echo "failed" && exit 1 & + wait -n && pkill -f "kubectl wait --namespace=${OPS_NAMESPACE}" + ``` + + ``` + job.batch/model-downloader condition met + complete + ``` + + Now, the model is downloaded to the persistent volume. 
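As an optional check before imaging the disk in the next section, you can confirm that the PVC holding the downloaded weights is bound. A short sketch, using the `vllm-models` PVC referenced in the next section:

```sh
# Sketch: confirm the PVC backing the downloaded model weights is Bound and note
# the PersistentVolume it is bound to before creating the disk image.
kubectl --namespace ${OPS_NAMESPACE} get pvc/vllm-models \
  --output custom-columns="NAME:.metadata.name,STATUS:.status.phase,VOLUME:.spec.volumeName"
```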
+ +### Create the Persistent Disk + +- Fetch the Persistent volume name and disk ref to create a disk image + + ```sh + PV_NAME="$(kubectl --namespace ${OPS_NAMESPACE} get pvc/vllm-models -o jsonpath='{.spec.volumeName}')" + GCE_DISK_REF="$(kubectl --namespace ${OPS_NAMESPACE} get pv/${PV_NAME} -o jsonpath='{.spec.csi.volumeHandle}')" + echo "PV_NAME=${PV_NAME}" + echo "GCE_DISK_REF=${GCE_DISK_REF}" + ``` + +- Create a Compute Engine image + + ```sh + gcloud compute images create ${GCE_IMAGE_NAME} \ + --source-disk="${GCE_DISK_REF}" + ``` + +- Create a Persistent Disk from the image + + ```sh + gcloud compute disks create ${GCE_DISK_NAME} \ + --image=${GCE_IMAGE_NAME} \ + --size=1TiB \ + --type=pd-ssd \ + --zone=${ZONE} + ``` + + > Note: Ensure the appropriate zone based on cluster node location and GPU availability + +### Create the PersistentVolumeClaim (PVC) and PersistentVolume (PV) for serving + +- Configure the PersistentVolume + + ```sh + VOLUME_HANDLE="projects/${MLP_PROJECT_ID}/zones/${ZONE}/disks/${GCE_DISK_NAME}" + echo "VOLUME_HANDLE=${VOLUME_HANDLE}" + sed \ + -i -e "s|V_VOLUME_HANDLE|${VOLUME_HANDLE}|" \ + -i -e "s|V_ZONE|${ZONE}|" \ + manifests/volume-prep/persistent_volume.yaml + ``` + +- Create the PersistentVolume + + ``` + kubectl apply -f manifests/volume-prep/persistent_volume.yaml + ``` + + > Note: PersistenVolumes are cluster-wide resources, meaning they do not belong to any specific namespace. + +- Configure the PersistentVolumeClaim + + ```sh + sed \ + -i -e "s|V_ZONE|${ZONE}|" \ + manifests/volume-prep/persistent_volume_claim.yaml + ``` + +- Create the PersistentVolumeClaim + + ``` + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/volume-prep/persistent_volume_claim.yaml + ``` + +## Serve the model with vLLM + +- Configure the deployment + + ``` + VLLM_IMAGE_NAME="vllm/vllm-openai:v0.6.3.post1" + ``` + + ```sh + sed \ + -i -e "s|V_MODEL_BUCKET|${MLP_MODEL_BUCKET}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|" \ + -i -e "s|V_KSA|${SERVE_KSA}|" \ + -i -e "s|V_VLLM_IMAGE_URL|${VLLM_IMAGE_NAME}|" \ + -i -e "s|V_ZONE|${ZONE}|" \ + manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Create the deployment + + ``` + kubectl --namespace ${SERVE_NAMESPACE} apply -f manifests/model-deployment-${ACCELERATOR}.yaml + ``` + +- Wait for the deployment to be ready + + ```sh + kubectl --namespace ${SERVE_NAMESPACE} wait --for=condition=ready --timeout=900s pod --selector app=vllm-openai-pd-${ACCELERATOR} + ``` + +## Serve the model through a web chat interface + +- Configure the deployment + + ```sh + sed \ + -i -e "s|V_ACCELERATOR|${ACCELERATOR}|" \ + -i -e "s|V_MODEL_NAME|${MODEL_NAME}|g" \ + -i -e "s|V_MODEL_VERSION|${MODEL_VERSION}|g" \ + manifests/gradio.yaml + ``` + +- Create the deployment + + ```sh + kubectl apply -f manifests/gradio.yaml -n ${SERVE_NAMESPACE} + ``` + +- Verify the deployment is ready + +- Access the chat interface + + ```sh + echo -e "\nGradio chat interface: ${MLP_GRADIO_NAMESPACE_ENDPOINT}\n" + ``` + +- Enter the following prompt in the chat text box to get the response from the model. + + ``` + I'm looking for comfortable cycling shorts for women, what are some good options? + ``` + +## Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the system. For more information about accessing these metrics see [vLLM Metrics](/use-cases/inferencing/serving/vllm/metrics/README.md). 
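If you want to exercise the OpenAI-compatible API directly in addition to the Gradio interface, the following is a minimal sketch. The service name and model path follow the values configured earlier in this guide, so adjust them if yours differ:

```sh
# Sketch: send one chat completion through a temporary port-forward. The model
# name is the path vLLM was started with (the MODEL env var in the deployment).
kubectl --namespace ${SERVE_NAMESPACE} port-forward "svc/vllm-openai-pd-${ACCELERATOR}" 8000:8000 &
sleep 5
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d @- <<EOF
{
  "model": "/local/${MODEL_NAME}/${MODEL_VERSION}",
  "messages": [
    {"role": "user", "content": "I'm looking for comfortable cycling shorts for women, what are some good options?"}
  ],
  "temperature": 0.7,
  "max_tokens": 256
}
EOF
kill %1
```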
+
+### Run Batch inference on GKE
+
+Once a model has completed fine-tuning and is deployed on GKE, you can run batch inference against it. Follow the instructions in the [batch-inference readme](/use-cases/inferencing/batch-inference/README.md) to run batch inference.
+
+### Run benchmarks for inference
+
+The deployed model is now ready for inference benchmarking. Follow the [benchmark readme](/use-cases/inferencing/benchmarks/README.md) to run inference benchmarks against your model.
+
+### Inference at Scale
+
+You can configure the Horizontal Pod Autoscaler to scale your inference deployment based
+on relevant metrics. Follow the instructions in the
+[inference at scale readme](/use-cases/inferencing/inference-scale/README.md) to scale your
+deployed model.
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/gradio.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/gradio.yaml
new file mode 100644
index 00000000..c60fff2d
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/gradio.yaml
@@ -0,0 +1,74 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: gradio
+  name: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+      - env:
+        - name: CONTEXT_PATH
+          value: /v1/chat/completions
+        - name: HOST
+          value: http://vllm-openai-pd-V_ACCELERATOR:8000
+        - name: DISABLE_SYSTEM_MESSAGE
+          value: 'true'
+        - name: LLM_ENGINE
+          value: openai
+        - name: MODEL_ID
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: USER_PROMPT
+          value: |
+            user
+            prompt
+        - name: SYSTEM_PROMPT
+          value: |
+            model
+            prompt
+        image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.3
+        name: gradio
+        ports:
+        - containerPort: 7860
+        resources:
+          limits:
+            cpu: 500m
+            memory: 512Mi
+          requests:
+            cpu: 250m
+            memory: 512Mi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio
+spec:
+  ports:
+  - port: 8080
+    protocol: TCP
+    targetPort: 7860
+  selector:
+    app: gradio
+  type: LoadBalancer
\ No newline at end of file
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-a100.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-a100.yaml
new file mode 100644
index 00000000..3a509524
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-a100.yaml
@@ -0,0 +1,101 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-openai-pd-a100
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-openai-pd-a100
+  template:
+    metadata:
+      labels:
+        app: vllm-openai-pd-a100
+    spec:
+      containers:
+      - args:
+        - '--model=$(MODEL)'
+        - '--tensor-parallel-size=2'
+        env:
+        - name: MODEL
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+        - name: VLLM_ATTENTION_BACKEND
+          value: FLASHINFER
+        image: V_VLLM_IMAGE_URL
+        name: inference-server
+        readinessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 240
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        resources:
+          limits:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+          requests:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /local
+          name: model-disk
+          readOnly: true
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-tesla-a100
+      serviceAccountName: V_KSA
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Equal
+        value: present
+      - effect: NoSchedule
+        key: on-demand
+        operator: Equal
+        value: 'true'
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: model-disk
+        persistentVolumeClaim:
+          claimName: vllm-model-weights-ssd-1024gb-V_ZONE-ro
+          readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-openai-pd-a100
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-openai-pd-a100
+  type: ClusterIP
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-h100.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-h100.yaml
new file mode 100644
index 00000000..0b7d3021
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-h100.yaml
@@ -0,0 +1,101 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-openai-pd-h100
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-openai-pd-h100
+  template:
+    metadata:
+      labels:
+        app: vllm-openai-pd-h100
+    spec:
+      containers:
+      - args:
+        - '--model=$(MODEL)'
+        - '--tensor-parallel-size=2'
+        env:
+        - name: MODEL
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+        - name: VLLM_ATTENTION_BACKEND
+          value: FLASHINFER
+        image: V_VLLM_IMAGE_URL
+        name: inference-server
+        readinessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 240
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        resources:
+          limits:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+          requests:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /local
+          name: model-disk
+          readOnly: true
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-h100-80gb
+      serviceAccountName: V_KSA
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Equal
+        value: present
+      - effect: NoSchedule
+        key: on-demand
+        operator: Equal
+        value: 'true'
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: model-disk
+        persistentVolumeClaim:
+          claimName: vllm-model-weights-ssd-1024gb-V_ZONE-ro
+          readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-openai-pd-h100
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-openai-pd-h100
+  type: ClusterIP
\ No newline at end of file
diff --git a/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-l4.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-l4.yaml
new file mode 100644
index 00000000..bccb564e
--- /dev/null
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/model-deployment-l4.yaml
@@ -0,0 +1,101 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-openai-pd-l4
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-openai-pd-l4
+  template:
+    metadata:
+      labels:
+        app: vllm-openai-pd-l4
+    spec:
+      containers:
+      - args:
+        - '--model=$(MODEL)'
+        - '--tensor-parallel-size=2'
+        env:
+        - name: MODEL
+          value: /local/V_MODEL_NAME/V_MODEL_VERSION
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+        - name: VLLM_ATTENTION_BACKEND
+          value: FLASHINFER
+        image: V_VLLM_IMAGE_URL
+        name: inference-server
+        readinessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 240
+          periodSeconds: 5
+          successThreshold: 1
+          timeoutSeconds: 1
+        resources:
+          limits:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+          requests:
+            cpu: '2'
+            ephemeral-storage: 25Gi
+            memory: 25Gi
+            nvidia.com/gpu: '2'
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /local
+          name: model-disk
+          readOnly: true
+      nodeSelector:
+        cloud.google.com/gke-accelerator: nvidia-l4
+      serviceAccountName: V_KSA
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Equal
+        value: present
+      - effect: NoSchedule
+        key: on-demand
+        operator: Equal
+        value: 'true'
+      volumes:
+      - emptyDir:
+          medium: Memory
+        name: dshm
+      - name: model-disk
+        persistentVolumeClaim:
+          claimName: vllm-model-weights-ssd-1024gb-V_ZONE-ro
+          readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-openai-pd-l4
+spec:
+  ports:
+  - port: 8000
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-openai-pd-l4
+  type: ClusterIP
\ No newline at end of file
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
similarity index 77%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
index c93db8f7..1a2706e8 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/batch_job_download_model_on_pv_volume.yaml
@@ -15,7 +15,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  generateName: module-download-job-
+  name: model-downloader
   labels:
     app: model-downloader
 spec:
@@ -31,10 +31,12 @@ spec:
         gke-gcsfuse/volumes: 'true'
         gke-gcsfuse/memory-limit: 10Gi
         gke-gcsfuse/memory-request: 4Gi
+      labels:
+        app: model-downloader
     spec:
-      serviceAccountName: V_KSA # sed replace KSA name
+      serviceAccountName: V_KSA
       containers:
-      - name: dldr
+      - name: model-downloader
        image: debian:latest
        command: ["/bin/sh", "-c"]
        args:
@@ -42,15 +44,16 @@ spec:
          apt-get update && apt-get install -y fuse2fs
          mkfs.ext4 -FF /dev/xvda
          fuse2fs -o fakeroot /dev/xvda /mnt
-         mkdir -p /mnt/V_MODEL_ID
-         cp -r /data/models/V_MODEL_ID/V_MODEL_DIR_PATH /mnt/V_MODEL_ID
-         ls -lR /mnt/V_MODEL_ID
+         mkdir -p /mnt/V_MODEL_NAME/V_MODEL_VERSION
+         cp -r /gcs/V_MODEL_NAME/V_MODEL_VERSION /mnt/V_MODEL_NAME/
+         ls -alh /mnt/V_MODEL_NAME
+         ls -alh /mnt/V_MODEL_NAME/V_MODEL_VERSION
        volumeDevices:
        - name: data
          devicePath: /dev/xvda
        volumeMounts:
-       - name: model-weights
-         mountPath: /data/models
+       - name: vllm-model-weights
+         mountPath: /gcs
        securityContext:
          privileged: true
       restartPolicy: Never
@@ -61,13 +64,13 @@ spec:
       volumes:
       - name: data
         persistentVolumeClaim:
-          claimName: block-pvc-model
-      - name: model-weights
+          claimName: vllm-models
+      - name: vllm-model-weights
         csi:
           driver: gcsfuse.csi.storage.gke.io
           readOnly: true
           volumeAttributes:
-            bucketName: V_MODEL_BUCKET # sed replace with your bucket name
+            bucketName: V_MODEL_BUCKET
             mountOptions: implicit-dirs
             gcsfuseLoggingSeverity: warning
             fileCacheCapacity: 10Gi
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume.yaml
similarity index 93%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume.yaml
index 6bc6ded0..d6bf1768 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume.yaml
@@ -15,9 +15,9 @@
 apiVersion: v1
 kind: PersistentVolume
 metadata:
-  name: model-weights-disk-1024gb-zone-a
+  name: vllm-model-weights-ssd-1024gb-V_ZONE
   labels:
-    pv-usage: model-weights
+    pv-usage: vllm-model-weights
     pv-spec: ssd-1024G
 spec:
   storageClassName: ""
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume_claim.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume_claim.yaml
similarity index 90%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume_claim.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume_claim.yaml
index d172d9c3..b01926c9 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/persistent_volume_claim.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/persistent_volume_claim.yaml
@@ -15,7 +15,7 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: pvc-model-weights-claim-1024gb-zone-a-ro
+  name: vllm-model-weights-ssd-1024gb-V_ZONE-ro
 spec:
   storageClassName: ""
   accessModes:
@@ -25,5 +25,5 @@ spec:
       storage: 1024Gi
   selector:
     matchLabels:
-      pv-usage: model-weights
+      pv-usage: vllm-model-weights
       pv-spec: ssd-1024G
diff --git a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/pvc_disk_image.yaml b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/pvc_disk_image.yaml
similarity index 96%
rename from use-cases/inferencing/serving-with-vllm/manifests/volume-prep/pvc_disk_image.yaml
rename to use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/pvc_disk_image.yaml
index 83d73f30..3498f193 100644
--- a/use-cases/inferencing/serving-with-vllm/manifests/volume-prep/pvc_disk_image.yaml
+++ b/use-cases/inferencing/serving/vllm/persistent-disk/manifests/volume-prep/pvc_disk_image.yaml
@@ -15,7 +15,7 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: block-pvc-model
+  name: vllm-models
 spec:
   accessModes:
   - ReadWriteOnce