Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Secondary boot disk pipeline #1330

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
11 changes: 11 additions & 0 deletions ai-ml/secondary-boot-disk/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Use the vLLM serving image published by Vertex AI Model Garden as the base.
FROM us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240220_0936_RC01 AS base_image

# Build argument naming the directory (relative to the build context) that
# holds the downloaded model weights.
ARG MODEL_PATH

# Expose the model path to the running container as an environment variable.
ENV MODEL_PATH=${MODEL_PATH}

# Copy the model files from the build context into the image.
# NOTE: this is a single-stage build — COPY without --from reads from the
# build context, not from another stage.
COPY ./$MODEL_PATH /app/$MODEL_PATH
28 changes: 28 additions & 0 deletions ai-ml/secondary-boot-disk/cloudbuild-disk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
steps:
  # Step 1: ensure the Cloud Storage bucket that stores the execution logs of
  # gke-disk-image-builder exists. `describe` is an exact existence check
  # (unlike grepping `buckets list`, which can false-positive on substring
  # matches of other bucket names); create the bucket only if it is missing.
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    entrypoint: '/bin/bash'
    args:
      - '-c'
      - |
        gcloud storage buckets describe ${_BUCKET_NAME} || gcloud storage buckets create ${_BUCKET_NAME} --location=${_REGION} --uniform-bucket-level-access

  # Step 2: clone the repository that contains gke-disk-image-builder.
  - name: 'gcr.io/cloud-builders/git'
    args: ['clone', 'https://github.com/GoogleCloudPlatform/ai-on-gke.git']

  # Step 3: run gke-disk-image-builder to bake the container image into a
  # Compute Engine disk image usable as a GKE secondary boot disk.
  - name: 'gcr.io/cloud-builders/go:1.21'
    env: ['GOPATH=./ai-on-gke/tools/gke-disk-image-builder']
    dir: './ai-on-gke/tools/gke-disk-image-builder'
    args:
      - 'run'
      - './cli'
      - --project-name=$PROJECT_ID
      - --image-name=${_DISK_IMAGE}
      - --zone=${_ZONE}
      - --gcs-path=${_BUCKET_NAME}
      - --disk-size-gb=100
      - --timeout=120m
      - --container-image=${_CONTAINER_IMAGE}
      # Pull from Artifact Registry using the build VM's service account.
      - --image-pull-auth=ServiceAccountToken
37 changes: 37 additions & 0 deletions ai-ml/secondary-boot-disk/cloudbuild-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
steps:
  # Step 1: clone the model repository (including its LFS-tracked weight
  # files) from Hugging Face. `git lfs clone` is deprecated; after
  # `git lfs install` a plain `git clone` downloads LFS objects on checkout.
  # Credentials come from Secret Manager via secretEnv; `$$` escapes the
  # variables so Cloud Build leaves expansion to bash.
  - name: 'gcr.io/cloud-builders/git'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        df -h && \
        apt-get update && \
        apt-get install -y git-lfs && \
        git lfs install && \
        git clone https://$$HF_USERNAME:$$HF_TOKEN@huggingface.co/google/${_MODEL_PATH} ./${_MODEL_PATH}
    secretEnv: ['HF_USERNAME', 'HF_TOKEN']

  # Step 2: build the Docker image with the model weights baked in.
  - name: 'gcr.io/cloud-builders/docker'
    args:
      - 'build'
      - '--build-arg'
      - 'MODEL_PATH=${_MODEL_PATH}'
      - '-t'
      - '$_REGION-docker.pkg.dev/$_PROJECT_ID/$_REPOSITORY_NAME/$_IMAGE_NAME:${_IMAGE_TAG}'
      - '.'

# Model weights are large: use a high-CPU worker with extra disk.
options:
  machineType: 'E2_HIGHCPU_32'
  diskSizeGb: 250

# Push the built image to Artifact Registry when the build succeeds.
images:
  - '$_REGION-docker.pkg.dev/$_PROJECT_ID/$_REPOSITORY_NAME/$_IMAGE_NAME:${_IMAGE_TAG}'

# Hugging Face credentials, read from Secret Manager at build time.
availableSecrets:
  secretManager:
    - versionName: projects/$_PROJECT_ID/secrets/hf-username/versions/latest
      env: 'HF_USERNAME'
    - versionName: projects/$_PROJECT_ID/secrets/hf-token/versions/latest
      env: 'HF_TOKEN'
68 changes: 68 additions & 0 deletions ai-ml/secondary-boot-disk/model-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# vLLM inference server serving gemma-2b-it, plus a ClusterIP Service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-2b-it
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
        - name: inference-server
          # Placeholder substituted by run-commands.sh (sed) at deploy time.
          image: <CONTAINER_IMAGE>
          readinessProbe:
            httpGet:
              # httpGet.path must be a URL path, not a full URL — the kubelet
              # builds the request from the pod IP and `port`.
              path: /health
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
          resources:
            requests:
              cpu: "2"
              memory: "7Gi"
              ephemeral-storage: "10Gi"
              nvidia.com/gpu: 1
            limits:
              cpu: "2"
              memory: "7Gi"
              ephemeral-storage: "10Gi"
              nvidia.com/gpu: 1
          command: ["python3", "-m", "vllm.entrypoints.api_server"]
          args:
            - --model=$(MODEL_ID)
            - --tensor-parallel-size=1
          env:
            # Path where the Dockerfile copied the model weights.
            - name: MODEL_ID
              value: /app/gemma-2b-it
            - name: PORT
              value: "8000"
          volumeMounts:
            # vLLM uses shared memory for tensor exchange.
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
96 changes: 96 additions & 0 deletions ai-ml/secondary-boot-disk/run-commands.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/bin/bash
# Commands to build a container image with preloaded model weights, bake it
# into a GKE secondary boot disk image, and deploy it to a GKE cluster.
# Replace the UPPERCASE placeholders before running.

# Ensure that the project_id is set
gcloud config set project PROJECT_ID

# Set the required environment variables
export PROJECT_ID=$(gcloud config get project) \
  && export PROJECT_NUMBER=$(gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)") \
  && export REGION=REGION \
  && export ZONE=$REGION-a \
  && export REPOSITORY_NAME=AR_REPOSITORY_NAME \
  && export MODEL_PATH=MODEL_PATH_NAME \
  && export IMAGE_NAME=$MODEL_PATH-container-image \
  && export IMAGE_TAG=IMAGE_VERSION_TAG \
  && export DISK_IMAGE=$MODEL_PATH-disk-image \
  && export CONTAINER_IMAGE=$REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY_NAME/$IMAGE_NAME:$IMAGE_TAG \
  && export BUCKET_NAME=gs://BUCKET_FOR_LOGS_NAME/ \
  && export CLUSTER_NAME=CLUSTER_NAME

# Roles the default Cloud Build service account needs for this pipeline.
CLOUD_BUILD_ROLES="roles/compute.instanceAdmin.v1 roles/iam.serviceAccountUser roles/secretmanager.secretAccessor"

# Add the Hugging Face username and Hugging Face user token to the cloud secrets
echo -n 'YOUR_HUGGINGFACE_USER_NAME' | gcloud secrets create hf-username --data-file=- \
  && echo -n 'YOUR_HUGGINGFACE_USER_TOKEN' | gcloud secrets create hf-token --data-file=-

# Create a repository in the Artifact Registry
gcloud artifacts repositories create $REPOSITORY_NAME \
  --repository-format=docker \
  --location=$REGION \
  --description="repository to store the container images with the preloaded model weights and the configuration files"

# Add the required permissions to the default Cloud Build service account.
# NOTE: add-iam-policy-binding accepts only ONE --role per invocation, so
# each role must be granted in its own call.
for role in $CLOUD_BUILD_ROLES; do
  gcloud projects add-iam-policy-binding $PROJECT_ID \
    --member="serviceAccount:$PROJECT_NUMBER@cloudbuild.gserviceaccount.com" \
    --role="$role" \
    --condition=None
done

# Run the Cloud Build command with substitutions (container image build)
gcloud builds submit \
  --config cloudbuild-image.yaml \
  --substitutions=_MODEL_PATH=$MODEL_PATH,_PROJECT_ID=$PROJECT_ID,_REPOSITORY_NAME=$REPOSITORY_NAME,_IMAGE_NAME=$IMAGE_NAME,_REGION=$REGION,_IMAGE_TAG=$IMAGE_TAG

# Check the existing container image
gcloud artifacts docker images list $REGION-docker.pkg.dev/$PROJECT_ID/$REPOSITORY_NAME

# Run the Cloud Build command with substitutions (disk image build)
gcloud builds submit --config cloudbuild-disk.yaml --no-source \
  --substitutions=_DISK_IMAGE=$DISK_IMAGE,_CONTAINER_IMAGE=$CONTAINER_IMAGE,_BUCKET_NAME=$BUCKET_NAME,_REGION=$REGION,_ZONE=$ZONE

# Check the existing disk image
gcloud compute images list --no-standard-images

# Create a GKE Standard cluster
gcloud container clusters create ${CLUSTER_NAME} \
  --project=${PROJECT_ID} \
  --region=${REGION} \
  --workload-pool=${PROJECT_ID}.svc.id.goog \
  --release-channel=rapid \
  --cluster-version=1.28 \
  --num-nodes=1 \
  --enable-image-streaming

# Create a node pool with a secondary boot disk preloaded from the disk image
gcloud beta container node-pools create gpupool \
  --accelerator type=nvidia-l4,count=2,gpu-driver-version=latest \
  --project=${PROJECT_ID} \
  --location=${REGION} \
  --node-locations=${ZONE} \
  --cluster=${CLUSTER_NAME} \
  --machine-type=g2-standard-24 \
  --num-nodes=1 \
  --disk-size 200 \
  --enable-image-streaming \
  --secondary-boot-disk=disk-image=projects/${PROJECT_ID}/global/images/${DISK_IMAGE},mode=CONTAINER_IMAGE_CACHE

# Apply the deployment, substituting the placeholder with the container image name
sed "s|<CONTAINER_IMAGE>|$CONTAINER_IMAGE|" model-deployment.yaml | kubectl apply -f -

# Clean-up section
gcloud secrets delete hf-username \
  --quiet \
  && gcloud secrets delete hf-token \
  --quiet \
  && gcloud artifacts repositories delete $REPOSITORY_NAME \
  --location=$REGION \
  --quiet

# Remove each role binding individually (one --role per invocation).
for role in $CLOUD_BUILD_ROLES; do
  gcloud projects remove-iam-policy-binding $PROJECT_ID \
    --member="serviceAccount:$PROJECT_NUMBER@cloudbuild.gserviceaccount.com" \
    --role="$role" \
    --condition=None
done

gsutil -m rm -rf $BUCKET_NAME \
  && gcloud compute images delete $DISK_IMAGE \
  --quiet \
  && gcloud container clusters delete $CLUSTER_NAME \
  --region=$REGION \
  --quiet