From 4cb675c6609332f82b3fb4d68bbf970746f2e876 Mon Sep 17 00:00:00 2001 From: Heba <31887807+helayoty@users.noreply.github.com> Date: Thu, 2 May 2024 18:39:23 -0700 Subject: [PATCH 1/2] Create gpu-provisioner helm values template Signed-off-by: Heba <31887807+helayoty@users.noreply.github.com> Signed-off-by: Heba Elayoty --- gpu-provisioner-values-template.yaml | 23 +++++++++++++++++++ hack/deploy/configure-helm-values.sh | 34 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 gpu-provisioner-values-template.yaml create mode 100755 hack/deploy/configure-helm-values.sh diff --git a/gpu-provisioner-values-template.yaml b/gpu-provisioner-values-template.yaml new file mode 100644 index 00000000..71d1f3e7 --- /dev/null +++ b/gpu-provisioner-values-template.yaml @@ -0,0 +1,23 @@ + +replicas: 1 # for better debugging experience +controller: + env: + # Azure client settings + - name: ARM_SUBSCRIPTION_ID + value: ${AZURE_SUBSCRIPTION_ID} + - name: LOCATION + value: ${AZURE_LOCATION} + - name: AZURE_CLUSTER_NAME + value: ${CLUSTER_NAME} + - name: AZURE_NODE_RESOURCE_GROUP + value: ${AZURE_RESOURCE_GROUP_MC} + - name: ARM_RESOURCE_GROUP + value: ${AZURE_RESOURCE_GROUP} + - name: LEADER_ELECT # disable leader election for better debugging experience + value: "false" + - name: E2E_TEST_MODE + value: "false" + +workloadIdentity: + clientId: ${GPU_PROVISIONER_USER_ASSIGNED_CLIENT_ID} + tenantId: ${AZURE_TENANT_ID} diff --git a/hack/deploy/configure-helm-values.sh b/hack/deploy/configure-helm-values.sh new file mode 100755 index 00000000..dc07e287 --- /dev/null +++ b/hack/deploy/configure-helm-values.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# https://github.com/Azure/karpenter-provider-azure/blob/2beb773cbd3134eeabb8c96b72a130b86b1a91e1/hack/deploy/configure-values.sh +# Usage: configure-helm-values.sh <cluster-name> <resource-group> <user-assigned-identity-name> +set -euo pipefail + +# This script interrogates the AKS cluster and Azure resources to generate +# the gpu-provisioner-values.yaml file using the
gpu-provisioner-values-template.yaml file as a template. + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <cluster-name> <resource-group> <user-assigned-identity-name>" >&2 + exit 1 +fi + +echo "Configuring gpu-provisioner-values.yaml for cluster $1 in resource group $2 ..." + +CLUSTER_NAME=$1 +AZURE_RESOURCE_GROUP=$2 +AZURE_GPU_PROVISIONER_USER_ASSIGNED_IDENTITY_NAME=$3 + +AKS_JSON=$(az aks show --name "$CLUSTER_NAME" --resource-group "$AZURE_RESOURCE_GROUP" --output json) +AZURE_LOCATION=$(jq -r ".location" <<< "$AKS_JSON") +AZURE_RESOURCE_GROUP_MC=$(jq -r ".nodeResourceGroup" <<< "$AKS_JSON") +AZURE_TENANT_ID=$(az account show --output json | jq -r ".tenantId") +# Honor a caller-exported AZURE_SUBSCRIPTION_ID; otherwise fall back to the active az account. +AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID:-$(az account show --query 'id' -otsv)} +GPU_PROVISIONER_USER_ASSIGNED_CLIENT_ID=$(az identity show --resource-group "${AZURE_RESOURCE_GROUP}" --name "${AZURE_GPU_PROVISIONER_USER_ASSIGNED_IDENTITY_NAME}" --query 'clientId' -otsv) +# Every variable the template references must be exported: yq runs as a child process and envsubst(nu) fails on unset variables. +export CLUSTER_NAME AZURE_LOCATION AZURE_SUBSCRIPTION_ID AZURE_RESOURCE_GROUP AZURE_RESOURCE_GROUP_MC GPU_PROVISIONER_USER_ASSIGNED_CLIENT_ID AZURE_TENANT_ID + +# get gpu-provisioner-values-template.yaml, if not already present (e.g. outside of repo context) +if [ ! -f gpu-provisioner-values-template.yaml ]; then + curl -fsSO https://raw.githubusercontent.com/Azure/gpu-provisioner/main/gpu-provisioner-values-template.yaml +fi +yq '(.. | select(tag == "!!str")) |= envsubst(nu)' gpu-provisioner-values-template.yaml > gpu-provisioner-values.yaml \ No newline at end of file From 498d80f578b07dd2048a734b4a79ad70f89b1e18 Mon Sep 17 00:00:00 2001 From: Heba Elayoty Date: Fri, 3 May 2024 12:48:13 -0700 Subject: [PATCH 2/2] Update documentation Signed-off-by: Heba Elayoty --- Makefile | 31 ++++++++++++++++--------------- README.md | 16 ++++++++-------- charts/gpu-provisioner/README.md | 17 +++++++++++++++-- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index dd198fe0..c596c68b 100644 --- a/Makefile +++ b/Makefile @@ -152,21 +152,6 @@ vet: ## Run go vet against code.
lint: $(GOLANGCI_LINT) $(GOLANGCI_LINT) run -v -## -------------------------------------- -## Release -## To create a release, run `make release VERSION=x.y.z` -## -------------------------------------- -.PHONY: release-manifest -release-manifest: - @sed -i -e 's/^VERSION ?= .*/VERSION ?= ${VERSION}/' ./Makefile - @sed -i -e "s/version: .*/version: ${IMG_TAG}/" ./charts/gpu-provisioner/Chart.yaml - @sed -i -e "s/appVersion: .*/appVersion: ${IMG_TAG}/" ./charts/gpu-provisioner/Chart.yaml - @sed -i -e "s/tag: .*/tag: ${IMG_TAG}/" ./charts/gpu-provisioner/values.yaml - @sed -i -e 's/gpu-provisioner: .*/gpu-provisioner:${IMG_TAG}/' ./charts/gpu-provisioner/README.md - git checkout -b release-${VERSION} - git add ./Makefile ./charts/gpu-provisioner/Chart.yaml ./charts/gpu-provisioner/values.yaml ./charts/gpu-provisioner/README.md - git commit -s -m "release: update manifest and helm charts for ${VERSION}" - ## -------------------------------------- ## Tests ## -------------------------------------- @@ -188,3 +173,19 @@ e2etests: ## Run the e2e suite against your local cluster --ginkgo.timeout=${TEST_TIMEOUT} \ --ginkgo.grace-period=3m \ --ginkgo.vv + +## -------------------------------------- +## Release +## To create a release, run `make release VERSION=x.y.z` +## -------------------------------------- +.PHONY: release-manifest +release-manifest: + @sed -i -e 's/^VERSION ?= .*/VERSION ?= ${VERSION}/' ./Makefile + @sed -i -e "s/version: .*/version: ${IMG_TAG}/" ./charts/gpu-provisioner/Chart.yaml + @sed -i -e "s/appVersion: .*/appVersion: ${IMG_TAG}/" ./charts/gpu-provisioner/Chart.yaml + @sed -i -e "s/tag: .*/tag: ${IMG_TAG}/" ./charts/gpu-provisioner/values.yaml + @sed -i -e 's/gpu-provisioner: .*/gpu-provisioner:${IMG_TAG}/' ./charts/gpu-provisioner/README.md + @sed -i -e 's/CHART_VERSION=.*/CHART_VERSION=${IMG_TAG}/' ./charts/gpu-provisioner/README.md + git checkout -b release-${VERSION} + git add ./Makefile ./charts/gpu-provisioner/Chart.yaml 
./charts/gpu-provisioner/values.yaml ./charts/gpu-provisioner/README.md + git commit -s -m "release: update manifest and helm charts for ${VERSION}" diff --git a/README.md b/README.md index 159bb2ed..1cd0cbff 100644 --- a/README.md +++ b/README.md @@ -8,15 +8,15 @@ gpu-Provisioner is an [Azure Karpenter provider](https://github.com/Azure/karpen It implements the cloud provider interfaces to realize the following abstraction: `machine` -> `AKS agent pool` (with vmss and a hard limit of VM count to 1) -``` -VERSION=v0.2.0 make docker-build -make az-identity-perm -make az-patch-helm -helm install gpu-provisioner /charts/gpu-provisioner --namespace gpu-provisioner --create-namespace -make az-federated-credential -``` -You should have a running controller in `gpu-provisioner` namespace. +## Prerequisites +- An Azure subscription. +- An AKS cluster with [OIDC](https://learn.microsoft.com/en-us/azure/aks/use-oidc-issuer) addon installed. Please refer to the [Karpenter installation guide](https://karpenter.sh/docs/installation/) for more details. + +## Install gpu-provisioner +Please check the installation guidance [here](./charts/gpu-provisioner/README.md). + + ## How to test After deploying the controller successfully, one can apply the yaml in `/examples` to create a machine CR. A real node will be created and added to the cluster by the controller.
diff --git a/charts/gpu-provisioner/README.md b/charts/gpu-provisioner/README.md index 22b537f6..8ae3970b 100644 --- a/charts/gpu-provisioner/README.md +++ b/charts/gpu-provisioner/README.md @@ -9,7 +9,20 @@ A Helm chart for gpu-provisioner To install the chart with the release name `gpu-provisioner`: ```bash -helm install gpu-provisioner ./charts/gpu-provisioner --namespace gpu-provisioner --create-namespace +export CHART_VERSION=0.2.0 +export CLUSTER_NAME=my-cluster +export AZURE_RESOURCE_GROUP=my-rg +export AZURE_SUBSCRIPTION_ID=my-subscription-id +export MSI_NAME=gpuIdentity + +az identity create --name $MSI_NAME --resource-group $AZURE_RESOURCE_GROUP + +./hack/deploy/configure-helm-values.sh $CLUSTER_NAME $AZURE_RESOURCE_GROUP $MSI_NAME + +helm install gpu-provisioner \ + https://github.com/Azure/gpu-provisioner/raw/gh-pages/charts/gpu-provisioner-$CHART_VERSION.tgz \ + --values gpu-provisioner-values.yaml --namespace gpu-provisioner --create-namespace --wait +make az-federated-credential ``` ## Values @@ -47,7 +60,7 @@ helm install gpu-provisioner ./charts/gpu-provisioner --namespace gpu-provisione | podLabels | object | `{}` | Additional labels for the pod. | | podSecurityContext | object | `{"fsGroup":1000}` | SecurityContext for the pod. | | priorityClassName | string | `"system-cluster-critical"` | PriorityClass name for the pod. | -| replicas | int | `2` | Number of replicas. | +| replicas | int | `1` | Number of replicas. | | revisionHistoryLimit | int | `10` | The number of old ReplicaSets to retain to allow rollback. | | serviceAccount.annotations | object | `{}` | Additional annotations for the ServiceAccount. | | serviceAccount.create | bool | `true` | Specifies if a ServiceAccount should be created. |