Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
280 changes: 280 additions & 0 deletions .github/workflows/e2e-decode-heavy-gke.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
# End-to-end "decode heavy" benchmark run against a GKE cluster.
name: GKE Decode Heavy Test

on:
  # Runs with a PR comment /run-gke-decode-heavy
  issue_comment:
    types:
      - created
  # Manual run from the Actions tab against a chosen PR number or branch.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: "Pull-request number or branch name to test"
        required: true
        default: "main"
        type: string

# The GITHUB_TOKEN only needs to read repository contents.
permissions:
  contents: read

jobs:
  deploy_and_validate:
    # Run on manual dispatch, or when an OWNER / MEMBER / COLLABORATOR writes
    # a pull-request comment containing /run-gke-decode-heavy.
    #
    # NOTE: on issue_comment events, github.event.issue.pull_request only
    # carries URL fields (url, html_url, diff_url, patch_url); it has no
    # base/head refs. A `...pull_request.base.ref == 'main'` clause therefore
    # always evaluates to false and disables the comment trigger entirely, so
    # it is intentionally not part of this condition. If the base branch must
    # be enforced, look the PR up through the API inside a step instead.
    if: >
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/run-gke-decode-heavy') &&
        (
          github.event.comment.author_association == 'OWNER' ||
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR'
        )
      )
    name: Test on ${{ matrix.accelerator.name }}
    runs-on: ubuntu-latest

    strategy:
      # Keep remaining matrix entries running when one fails, but never run
      # two entries at once.
      fail-fast: false
      max-parallel: 1
      matrix:
        accelerator:
          - name: GPU

env:
  # Google Cloud project and GKE cluster the test runs against.
  GCP_PROJECT_ID: llm-d-scale
  GKE_CLUSTER_NAME: llm-d-e2e-us-east5
  GKE_CLUSTER_ZONE: us-east5

  # Kubernetes namespace and gateway settings.
  NAMESPACE: igw-decode-heavy
  GATEWAY: gke-l7-regional-external-managed
  # Passed to the inferencepool chart as provider.name.
  GATEWAY_TYPE: gke

  # Ref under test: dispatch input, else the commented issue/PR number,
  # else the literal fallback 'actions'.
  PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number || github.event.number || 'actions' }}

  # Model served and the Hugging Face token handed to the cluster.
  MODEL: meta-llama/Llama-3.1-8B-Instruct
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

  # Benchmark storage bucket, plus the Kubernetes service account and the
  # Google service account it is annotated with.
  GCS_BUCKET: igw-e2e-benchmark-results
  KSA_NAME: igw-e2e-benchmark-sa
  GSA_EMAIL: ${{ secrets.GCS_WORKLOAD_SA }}

steps:
# Check out the ref the workflow was triggered on; the token is not left in
# the local git config for later steps.
- name: Checkout
  uses: actions/checkout@v4
  with:
    persist-credentials: false

# Decide whether the ref under test is a pull-request number or a branch.
# Outputs: is_pr ('true' | 'false'). Also exports PR_OR_BRANCH through
# GITHUB_ENV for the following checkout steps.
- name: Determine if pr_or_branch is a PR number
  id: check_pr
  env:
    # workflow_dispatch supplies the input. On issue_comment the input is
    # empty, so fall back to the number of the PR that was commented on;
    # otherwise the comment trigger would try to check out a branch
    # literally named "actions".
    PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number }}
    # Context values are passed through the environment instead of being
    # interpolated into the script text.
    EVENT_NAME: ${{ github.event_name }}
    EVENT_PR_NUMBER: ${{ github.event.pull_request.number }}
  shell: bash
  run: |
    # Last-resort default, kept from the original behaviour.
    PR_OR_BRANCH="${PR_OR_BRANCH:-actions}"

    if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
      # An all-digit value is treated as a pull-request number.
      is_pr=true
    elif [[ "$EVENT_NAME" == "pull_request" ]]; then
      PR_OR_BRANCH="$EVENT_PR_NUMBER"
      is_pr=true
    else
      is_pr=false
    fi

    echo "PR_OR_BRANCH=${PR_OR_BRANCH}" >> "$GITHUB_ENV"
    echo "is_pr=${is_pr}" >> "$GITHUB_OUTPUT"

# Only when check_pr reported a pull-request number: fetch the PR head into a
# local pr-<number> branch and switch to it.
- name: Fetch and checkout PR
  if: steps.check_pr.outputs.is_pr == 'true'
  run: |
    local_branch="pr-${PR_OR_BRANCH}"
    git fetch origin "pull/${PR_OR_BRANCH}/head:${local_branch}"
    git checkout "${local_branch}"

# Only when check_pr reported a branch name: switch the work tree to it.
- name: Checkout branch
  if: steps.check_pr.outputs.is_pr == 'false'
  run: |
    # actions/checkout fetches only the ref the workflow was triggered on, so
    # any other branch has to be fetched before it can be checked out.
    git fetch origin "$PR_OR_BRANCH"
    # -B creates the local branch, or resets it when it already exists (for
    # example when the requested branch is the one already checked out).
    git checkout -B "$PR_OR_BRANCH" FETCH_HEAD

# Authenticate to Google Cloud using the service-account key held in the
# GCP_SA_KEY secret. The action is pinned to a commit SHA.
- name: Authenticate to Google Cloud
  id: auth
  uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
  with:
    credentials_json: "${{ secrets.GCP_SA_KEY }}"

# Install the gcloud CLI for the target project, together with kubectl and the
# GKE auth plugin components.
- name: Set up gcloud CLI and kubectl
  uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
  with:
    project_id: "${{ env.GCP_PROJECT_ID }}"
    install_components: "kubectl,gke-gcloud-auth-plugin"

# Fetch cluster credentials so the kubectl and helm calls in later steps
# target the test cluster.
- name: Get GKE credentials
  run: |
    gcloud container clusters get-credentials \
      "${{ env.GKE_CLUSTER_NAME }}" \
      --zone "${{ env.GKE_CLUSTER_ZONE }}"

# Ensure the test namespace exists.
- name: Create namespace
  run: |
    # Render the namespace client-side and apply it. This is idempotent when
    # the namespace already exists, while real errors (authentication, RBAC,
    # unreachable API server) still fail the step instead of being reported
    # as "already exists".
    kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml \
      | kubectl apply -f -

# Create or update the hf-token secret in the test namespace.
- name: Create hf-token secret
  env:
    # Hand the secret to the script through the environment rather than
    # interpolating it into the script text, so quotes or shell
    # metacharacters in the token cannot alter the command line.
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  run: |
    # Client-side dry run piped into apply makes this create-or-update.
    kubectl create secret generic hf-token \
      --from-literal="token=${HF_TOKEN}" \
      --namespace "${NAMESPACE}" \
      --dry-run=client -o yaml | kubectl apply -f -

# Create or update the Kubernetes service account, then annotate it with the
# Google service account it should be linked to.
- name: Create and Annotate KSA for Workload Identity
  run: |
    kubectl create serviceaccount ${KSA_NAME} \
      --namespace "${NAMESPACE}" \
      --dry-run=client -o yaml \
      | kubectl apply -f -

    # --overwrite replaces an annotation left over from an earlier run.
    kubectl annotate serviceaccount ${KSA_NAME} \
      --namespace "${NAMESPACE}" \
      --overwrite \
      iam.gke.io/gcp-service-account=${GSA_EMAIL}

# Apply the vLLM model-server manifest and the inference-extension CRDs,
# recording the model-server output in the shared deployment log.
- name: Deploy Model Server and CRDs
  # Naming the shell explicitly makes the runner use `bash -eo pipefail`.
  # The implicit default omits pipefail, in which case a failing
  # `kubectl apply` piped into `tee` would be reported as success.
  shell: bash
  run: |
    cd config/manifests/vllm
    echo "Deploying Model Server..."
    # First writer of the log: plain `tee` starts the file fresh.
    kubectl apply -f gpu-deployment.yaml -n "${NAMESPACE}" | tee ~/igw-decode-heavy-deployment.log
    echo "Installing CRDs"
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml
    echo "---------------------------------------" >> ~/igw-decode-heavy-deployment.log

# Install the inferencepool Helm chart for the vLLM deployment and append the
# install output to the shared deployment log.
- name: Deploy InferencePool and Endpoint Picker Extension
  # Explicit shell => `bash -eo pipefail`, so a failed `helm install` is not
  # masked by the `tee` at the end of the pipeline.
  shell: bash
  run: |
    export IGW_CHART_VERSION=v1.1.0
    # `tee -a` appends; plain `tee` would truncate the log and discard what
    # earlier steps wrote to it.
    helm install vllm-llama3-8b-instruct \
      --namespace "${NAMESPACE}" \
      --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
      --set provider.name="${GATEWAY_TYPE}" \
      --version "${IGW_CHART_VERSION}" \
      oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool | tee -a ~/igw-decode-heavy-deployment.log
    echo "---------------------------------------" >> ~/igw-decode-heavy-deployment.log

# Block until every pod in the namespace reports Ready (up to 25 minutes),
# then list the pods.
- name: Wait for all pods to be ready
  run: |
    kubectl wait pod --all \
      --namespace "${NAMESPACE}" \
      --for=condition=Ready \
      --timeout=25m
    echo "✅ All pods are ready."
    kubectl get pods --namespace "${NAMESPACE}"

# Recreate the Gateway and HTTPRoute from the v1.1.0 upstream manifests and
# append the apply output to the shared deployment log.
- name: Deploy Gateway
  # Explicit shell => `bash -eo pipefail`, so a failed `kubectl apply` is not
  # masked by the `tee` at the end of the pipeline.
  shell: bash
  run: |
    GATEWAY_NAME=inference-gateway
    # Remove leftovers from a previous run before re-applying.
    kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found
    kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found
    echo "Deploying Gateway..."
    # `tee -a` appends; plain `tee` would truncate the log on each call.
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/gateway.yaml -n "${NAMESPACE}" | tee -a ~/igw-decode-heavy-deployment.log
    echo "Deploying HTTPRoute..."
    kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/httproute.yaml -n "${NAMESPACE}" | tee -a ~/igw-decode-heavy-deployment.log
    echo "---------------------------------------" >> ~/igw-decode-heavy-deployment.log

# Block until the gateway reports Programmed=True (up to 500 seconds), then
# list the gateways in the namespace.
- name: Wait for gateway to be ready
  run: |
    gw_name=inference-gateway
    kubectl wait "gateway/${gw_name}" \
      --namespace "${NAMESPACE}" \
      --for=condition=Programmed=True \
      --timeout=500s
    echo "✅ Gateway is ready."
    kubectl get gateway --namespace "${NAMESPACE}"

# Print a snapshot of the namespace for the job log. The first three listings
# fail the step on error; the remaining ones are best-effort (`|| true`).
- name: Show deployment status
  run: |
    # Core workload objects.
    echo "=== Deployments ==="
    kubectl get deployments --namespace "${NAMESPACE}"
    echo ""

    echo "=== Pods ==="
    kubectl get pods --namespace "${NAMESPACE}"
    echo ""

    echo "=== Services ==="
    kubectl get svc --namespace "${NAMESPACE}"
    echo ""

    # Helm and gateway objects: best-effort only.
    echo "=== Helm releases ==="
    helm list --namespace "${NAMESPACE}" || true
    echo ""

    echo "=== Inference Pools ==="
    kubectl get inferencepools --namespace "${NAMESPACE}" || true
    echo ""

    echo "=== HTTPRoutes ==="
    kubectl get httproutes --namespace "${NAMESPACE}" -o yaml || true
    echo ""

    echo "=== Gateway ==="
    kubectl get Gateway --namespace "${NAMESPACE}" || true
    echo ""

# Run the repository's e2e validation script against the namespace and model.
- name: Verify installation and run validation test
  working-directory: .github/scripts/e2e
  run: |
    ./e2e-validate.sh -n "${NAMESPACE}" -v -m ${MODEL}

# Install the inference-perf chart with the decode-heavy values, pointed at
# the gateway address discovered in the namespace.
- name: Run benchmarking test
  working-directory: benchmarking/single-workload
  run: |
    run_stamp=$(date +"%Y-%m-%d-%H-%M-%S")

    # Prefer an explicit GATEWAY_HOST; otherwise use the first address of the
    # first Gateway in the namespace.
    gateway_addr="${GATEWAY_HOST:-}"
    if [[ -z "$gateway_addr" ]]; then
      gateway_addr="$(kubectl get gateway -n "$NAMESPACE" \
        -o jsonpath='{.items[0].status.addresses[0].value}' 2>/dev/null || true)"
    fi
    if [[ -z "$gateway_addr" ]]; then
      echo "Error: could not discover a Gateway address in namespace '$NAMESPACE'." >&2
      exit 1
    fi

    gateway_port=80
    base_url="http://${gateway_addr}:${gateway_port}"

    # Results are written under <namespace>/<timestamp> in the bucket.
    helm install decode-heavy-benchmark ../inference-perf/ \
      -f decode-heavy-values.yaml \
      --namespace "${NAMESPACE}" \
      --create-namespace \
      --set hfToken="${HF_TOKEN}" \
      --set "config.server.base_url=${base_url}" \
      --set "job.serviceAccountName=$KSA_NAME" \
      --set "job.image.tag=v0.2.0" \
      --set "config.storage.google_cloud_storage.bucket_name=${GCS_BUCKET}" \
      --set "config.storage.google_cloud_storage.path=${NAMESPACE}/${run_stamp}" \
      --set "gcsPath=gs://${GCS_BUCKET}/datasets/infinity_instruct.json" \
      --set "config.data.path=/gcsDataset/gcs-dataset.json" \
      --set-string 'job.resources.limits.nvidia\.com/gpu=1'

# Wait up to two hours for the benchmark job to complete. On timeout, dump the
# job description and the last pod log lines to stderr and fail the step.
- name: Wait for benchmarking job to finish
  run: |
    bench_job=decode-heavy-benchmark-inference-perf-job
    wait_limit="7200s"

    if kubectl wait --for=condition=complete job/"$bench_job" -n "$NAMESPACE" --timeout="$wait_limit"; then
      echo "✅ Benchmarking Job Completed."
      exit 0
    fi

    # The job did not reach the complete condition in time: emit diagnostics.
    echo "Error: Benchmark job $bench_job did not complete successfully within $wait_limit." >&2
    echo "--- Job Description ---" >&2
    kubectl describe job "$bench_job" -n "$NAMESPACE" >&2
    echo "--- Pod Logs (Last 50 lines) ---" >&2
    kubectl logs -l job-name="$bench_job" -n "$NAMESPACE" --all-containers=true --tail 50 >&2
    exit 1

# Always runs: write each pod's logs and description into a local directory,
# and move the deployment logs there, for the artifact upload step.
- name: Collect and upload Kubernetes pod logs
  if: always()
  run: |
    log_dir=pod-logs-inference-decode-heavy
    mkdir -p "${log_dir}"
    cd "${log_dir}"

    # The inner `sh -c` expands ${NAMESPACE} from the step environment;
    # xargs substitutes the pod name for {}.
    echo "Fetching ${NAMESPACE} pods log..."
    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
      | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1'

    echo "Fetching ${NAMESPACE} pods descriptions..."
    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
      | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1'

    # Either log file may be absent; do not fail the step for that.
    mv ~/igw-decode-heavy-deployment.log . || true
    mv ~/install-deps.log . || true

# Always runs: publish the collected log directory as a build artifact named
# after the matrix accelerator.
- name: Upload pod logs as artifact
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: "igw-pod-logs-inference-decode-heavy-${{ matrix.accelerator.name }}"
    path: pod-logs-inference-decode-heavy

# Only when an earlier step failed: post the job status to the Google Chat
# webhook. The action is pinned to a commit SHA.
- name: Send Google Chat notification on failure
  if: failure()
  uses: SimonScholz/google-chat-action@3b3519e5102dba8aa5046fd711c4b553586409bb
  with:
    webhookUrl: "${{ secrets.GOOGLE_CHAT_WEBHOOK }}"
    jobStatus: "${{ job.status }}"
    title: "${{ github.workflow }} - ${{ matrix.accelerator.name }}"

# Always runs: remove both Helm releases, then the HTTPRoute and the Gateway.
# Objects that are already gone are ignored.
- name: Cleanup deployment
  if: always()
  run: |
    gw_name=inference-gateway

    # Helm releases: the inference pool and the benchmark.
    helm uninstall vllm-llama3-8b-instruct --namespace ${NAMESPACE} --ignore-not-found
    helm uninstall decode-heavy-benchmark --namespace ${NAMESPACE} --ignore-not-found

    # Gateway API objects.
    kubectl delete httproute llm-route --namespace ${NAMESPACE} --ignore-not-found
    kubectl delete gateway ${gw_name} --namespace ${NAMESPACE} --ignore-not-found
28 changes: 22 additions & 6 deletions benchmarking/inference-perf/templates/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@ spec:
initContainers:
- name: fetch-gcs-dataset
image: google/cloud-sdk:latest
command: ["sh", "-c", "gsutil cp {{ .Values.gcsPath }} /dataset/gcs-dataset.json"]
command: ["sh", "-c", "gsutil cp {{ .Values.gcsPath }} /gcsDataset/gcs-dataset.json"]
volumeMounts:
- name: dataset-volume
mountPath: /dataset
- name: gcs-dataset-volume
mountPath: /gcsDataset
{{- end }}
{{- if .Values.s3Path}}
initContainers:
- name: fetch-s3-dataset
image: google/cloud-sdk:latest
command: ["sh", "-c", "aws s3 cp s3://{{ .Values.s3Path }} /dataset/s3-dataset.json"]
command: ["sh", "-c", "aws s3 cp s3://{{ .Values.s3Path }} /s3Dataset/s3-dataset.json"]
volumeMounts:
- name: dataset-volume
mountPath: /dataset
- name: s3-dataset-volume
mountPath: /s3Dataset
{{- end }}
containers:
- name: inference-perf-container
Expand All @@ -58,9 +58,25 @@ spec:
- name: config-volume
mountPath: {{ include "inference-perf.configMount" . }}
readOnly: true
{{- if .Values.gcsPath}}
- name: gcs-dataset-volume
mountPath: /gcsDataset
{{- end }}
{{- if .Values.s3Path}}
- name: s3-dataset-volume
mountPath: /s3Dataset
{{- end }}
resources:
{{- toYaml .Values.job.resources | nindent 12 }}
volumes:
- name: config-volume
configMap:
name: {{ include "inference-perf.fullname" . }}-config
{{- if .Values.gcsPath}}
- name: gcs-dataset-volume
emptyDir: {}
{{- end }}
{{- if .Values.s3Path}}
- name: s3-dataset-volume
emptyDir: {}
{{- end }}
2 changes: 1 addition & 1 deletion benchmarking/single-workload/decode-heavy-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ logLevel: INFO
# A GCS bucket path that points to the dataset file.
# The file will be copied from this path to the local file system
# at /gcsDataset/gcs-dataset.json for use during the run.
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/dataset.json.
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /gcsDataset/gcs-dataset.json.
gcsPath: ""

# An S3 bucket path that points to the dataset file.
Expand Down