Skip to content

Commit

Permalink
feat: Use Azure Federated instead of kubelet identity (#41)
Browse files Browse the repository at this point in the history
Signed-off-by: Heba Elayoty <hebaelayoty@gmail.com>
  • Loading branch information
helayoty authored Oct 19, 2023
1 parent 7d89bc1 commit b83c6bf
Show file tree
Hide file tree
Showing 19 changed files with 200 additions and 455 deletions.
22 changes: 17 additions & 5 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ jobs:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }}

- name: Create Azure Identity
shell: bash
run: |
az identity create --name gpuIdentity --resource-group ${{ env.CLUSTER_NAME }}

- name: build gpu-provisioner image
shell: bash
run: |
Expand All @@ -82,6 +87,12 @@ jobs:
REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io
VERSION: ${{ env.VERSION }}

# Grant the gpuIdentity managed identity Contributor on the cluster's resource group.
# The principal (object) ID is passed with --assignee-object-id and an explicit
# principal type so the CLI does not have to resolve the identity through
# Microsoft Graph, which can fail for an identity created moments earlier.
- name: Create Role Assignment
  shell: bash
  run: |
    IDENTITY_PRINCIPAL_ID="$(az identity show --name gpuIdentity --resource-group "${{ env.CLUSTER_NAME }}" --query 'principalId' -otsv)"
    az role assignment create --assignee-object-id "${IDENTITY_PRINCIPAL_ID}" --assignee-principal-type ServicePrincipal \
      --scope "/subscriptions/${{ secrets.SUBSCRIPTION_ID }}/resourceGroups/${{ env.CLUSTER_NAME }}" --role "Contributor"

- name: create cluster
shell: bash
run: |
Expand All @@ -91,18 +102,19 @@ jobs:
AZURE_ACR_NAME: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}

- name: update azure perms
- name: Create Azure Federated Identity
shell: bash
run: |
make az-perm
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
AKS_OIDC_ISSUER="$(az aks show -n "${{ env.CLUSTER_NAME }}" -g "${{ env.CLUSTER_NAME }}" --query 'oidcIssuerProfile.issuerUrl' -otsv)"
az identity federated-credential create --name gpu-fed-credential --identity-name gpuIdentity --resource-group "${{ env.CLUSTER_NAME }}" \
--issuer "${AKS_OIDC_ISSUER}" --subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange

- name: Install GPU-Provisioner helm chart
shell: bash
run: |
echo "IDENTITY_CLIENT_ID=$(az identity show --name gpuIdentity -g "${{ env.CLUSTER_NAME }}" --query 'clientId' -otsv)" >> $GITHUB_ENV
make az-patch-helm
helm install gpu-provisioner ./charts/gpu-provisioner
kubectl wait --for=condition=available deploy "gpu-provisioner" -n gpu-provisioner --timeout=300s
env:
AZURE_RESOURCE_GROUP: ${{ env.CLUSTER_NAME }}
Expand Down
39 changes: 22 additions & 17 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,44 +60,50 @@ az-mkacr: az-mkrg ## Create test ACR
az acr create --name $(AZURE_ACR_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --sku Standard --admin-enabled -o none
az acr login --name $(AZURE_ACR_NAME)

az-mkaks: az-mkacr ## Create test AKS cluster (with --vm-set-type AvailabilitySet for compatibility with standalone VMs)
az-mkaks: az-mkacr ## Create test AKS cluster (with msi, oidc and workload identity enabled)
az aks create --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) --attach-acr $(AZURE_ACR_NAME) \
--enable-managed-identity --node-count 1 --generate-ssh-keys --vm-set-type VirtualMachineScaleSets -o none
--node-count 1 --generate-ssh-keys --enable-managed-identity --enable-workload-identity --enable-oidc-issuer -o none
az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP)

az-rmrg: ## Destroy test ACR and AKS cluster by deleting the resource group (use with care!)
az group delete --name $(AZURE_RESOURCE_GROUP)

.PHONY: az-identity-perm
az-identity-perm: ## Create identity for gpu-provisioner
# Create the user-assigned managed identity named gpuIdentity.
	az identity create --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP)

# Look up the identity's principal ID and grant it Contributor on the resource group.
# The lookup and the assignment share one shell invocation (joined with `&& \`) so the
# shell variable is still set when it is used; `$$` defers expansion to the shell, which
# means the query runs only after the identity above has been created. `-o tsv` strips
# the JSON quotes from the query result.
	IDENTITY_PRINCIPAL_ID="$$(az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --subscription $(AZURE_SUBSCRIPTION_ID) --query 'principalId' -o tsv)" && \
	az role assignment create --assignee-object-id "$$IDENTITY_PRINCIPAL_ID" --assignee-principal-type ServicePrincipal \
		--scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) --role "Contributor"

# Read the cluster's OIDC issuer URL and register a federated credential that binds the
# gpu-provisioner/gpu-provisioner service account to gpuIdentity.
	AKS_OIDC_ISSUER="$$(az aks show -n "$(AZURE_CLUSTER_NAME)" -g "$(AZURE_RESOURCE_GROUP)" --subscription $(AZURE_SUBSCRIPTION_ID) --query "oidcIssuerProfile.issuerUrl" -o tsv)" && \
	az identity federated-credential create --name gpu-federatecredential --identity-name gpuIdentity --resource-group "$(AZURE_RESOURCE_GROUP)" --issuer "$$AKS_OIDC_ISSUER" \
		--subject system:serviceaccount:"gpu-provisioner:gpu-provisioner" --audience api://AzureADTokenExchange --subscription $(AZURE_SUBSCRIPTION_ID)

.PHONY: az-patch-helm
az-patch-helm: ## Update Azure client env vars and settings in helm values.yml
az aks get-credentials --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP)
$(eval AZURE_CLIENT_ID=$(shell az aks show --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) | jq -r ".identityProfile.kubeletidentity.clientId"))
$(eval AZURE_SUBNET_ID=$(shell az network vnet list --resource-group $(AZURE_RESOURCE_GROUP_MC) | jq -r ".[0].subnets[0].id"))
$(eval CLUSTER_ENDPOINT=$(shell kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}'))
$(eval IDENTITY_CLIENT_ID=$(shell az identity show --name gpuIdentity --resource-group $(AZURE_RESOURCE_GROUP) --query 'clientId' -o tsv))
$(eval AZURE_TENANT_ID=$(shell az account show | jq -r ".tenantId"))

yq -i '(.controller.image.repository) = "$(REGISTRY)/gpu-provisioner"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.image.tag) = "$(IMG_TAG)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="ARM_SUBSCRIPTION_ID")) .value = "$(AZURE_SUBSCRIPTION_ID)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="LOCATION")) .value = "$(AZURE_LOCATION)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="ARM_USER_ASSIGNED_IDENTITY_ID")) .value = "$(AZURE_CLIENT_ID)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="ARM_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="AZURE_NODE_RESOURCE_GROUP")) .value = "$(AZURE_RESOURCE_GROUP_MC)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="AZURE_CLUSTER_NAME")) .value = "$(AZURE_CLUSTER_NAME)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.controller.env[] | select(.name=="AZURE_SUBNET_ID")) .value = "$(AZURE_SUBNET_ID)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.settings.azure.clusterName) = "$(AZURE_CLUSTER_NAME)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.workloadIdentity.clientId) = "$(IDENTITY_CLIENT_ID)"' ./charts/gpu-provisioner/values.yaml
yq -i '(.workloadIdentity.tenantId) = "$(AZURE_TENANT_ID)"' ./charts/gpu-provisioner/values.yaml

helm install gpu-provisioner ./charts/gpu-provisioner

az-perm: ## Create role assignments to let Karpenter manage VMs and Network
$(eval AZURE_CLIENT_ID=$(shell az aks show --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) | jq -r ".identityProfile.kubeletidentity.objectId"))
az role assignment create --assignee-object-id $(AZURE_CLIENT_ID) --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP_MC) \
--role "Virtual Machine Contributor" --assignee-principal-type ServicePrincipal
az role assignment create --assignee-object-id $(AZURE_CLIENT_ID) --scope /subscriptions/$(AZURE_SUBSCRIPTION_ID)/resourceGroups/$(AZURE_RESOURCE_GROUP) \
--role "Contributor" --assignee-principal-type ServicePrincipal
#kubectl annotate sa gpu-provisioner -n gpu-provisioner azure.workload.identity/tenant-id="$(AZURE_TENANT_ID)" --overwrite

az-perm-acr:
$(eval AZURE_CLIENT_ID=$(shell az aks show --name $(AZURE_CLUSTER_NAME) --resource-group $(AZURE_RESOURCE_GROUP) | jq -r ".identityProfile.kubeletidentity.clientId"))
$(eval AZURE_ACR_ID=$(shell az acr show --name $(AZURE_ACR_NAME) --resource-group $(AZURE_RESOURCE_GROUP) | jq -r ".id"))
$(eval AZURE_ACR_ID=$(shell az acr show --name $(AZURE_ACR_NAME) --resource-group $(AZURE_RESOURCE_GROUP) | jq -r ".id"))
az role assignment create --assignee $(AZURE_CLIENT_ID) --scope $(AZURE_ACR_ID) --role "AcrPull"

az-build: ## Build the gpu-provisioner controller
Expand Down Expand Up @@ -177,7 +183,6 @@ e2etests: ## Run the e2e suite against your local cluster
-timeout ${TEST_TIMEOUT} \
-v \
./e2e/suites/suite_test.go \
--ginkgo.focus="${FOCUS}" \
--ginkgo.timeout=${TEST_TIMEOUT} \
--ginkgo.grace-period=3m \
--ginkgo.vv
11 changes: 0 additions & 11 deletions charts/gpu-provisioner/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,6 @@ app.kubernetes.io/name: {{ include "gpu-provisioner.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
*/}}
{{- define "gpu-provisioner.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "gpu-provisioner.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

{{/*
gpu-provisioner image to use
*/}}
Expand Down
2 changes: 1 addition & 1 deletion charts/gpu-provisioner/templates/clusterrole-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ roleRef:
name: {{ include "gpu-provisioner.fullname" . }}-core
subjects:
- kind: ServiceAccount
name: {{ template "gpu-provisioner.serviceAccountName" . }}
name: gpu-provisioner
namespace: {{ .Values.namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
30 changes: 3 additions & 27 deletions charts/gpu-provisioner/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ metadata:
name: {{ include "gpu-provisioner.fullname" . }}
namespace: {{ .Values.namespace }}
labels:
azure.workload.identity/use: "true"
{{- include "gpu-provisioner.labels" . | nindent 4 }}
{{- with .Values.additionalAnnotations }}
annotations:
Expand All @@ -29,6 +30,7 @@ spec:
template:
metadata:
labels:
azure.workload.identity/use: "true"
{{- include "gpu-provisioner.selectorLabels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
Expand All @@ -43,7 +45,7 @@ spec:
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "gpu-provisioner.serviceAccountName" . }}
serviceAccountName: gpu-provisioner
{{- with .Values.podSecurityContext }}
securityContext:
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -85,12 +87,6 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: MEMORY_LIMIT
valueFrom:
resourceFieldRef:
containerName: controller
divisor: "0"
resource: limits.memory
{{- with .Values.controller.env }}
{{- toYaml . | nindent 12 }}
{{- end }}
Expand Down Expand Up @@ -118,22 +114,6 @@ spec:
resources:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.controller.extraVolumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.controller.sidecarContainer }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if and (.Values.controller.sidecarContainer) (or .Values.controller.extraVolumeMounts .Values.controller.sidecarVolumeMounts) }}
volumeMounts:
{{- with .Values.controller.extraVolumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.controller.sidecarVolumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand All @@ -154,7 +134,3 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.extraVolumes }}
volumes:
{{- toYaml . | nindent 8 }}
{{- end }}
4 changes: 2 additions & 2 deletions charts/gpu-provisioner/templates/rolebinding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ roleRef:
name: {{ include "gpu-provisioner.fullname" . }}
subjects:
- kind: ServiceAccount
name: {{ template "gpu-provisioner.serviceAccountName" . }}
name: gpu-provisioner
namespace: {{ .Values.namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
Expand All @@ -35,5 +35,5 @@ roleRef:
name: {{ include "gpu-provisioner.fullname" . }}-dns
subjects:
- kind: ServiceAccount
name: {{ template "gpu-provisioner.serviceAccountName" . }}
name: gpu-provisioner
namespace: {{ .Values.namespace }}
14 changes: 3 additions & 11 deletions charts/gpu-provisioner/templates/serviceaccount.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,10 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "gpu-provisioner.serviceAccountName" . }}
name: gpu-provisioner
namespace: {{ .Values.namespace }}
labels:
{{- include "gpu-provisioner.labels" . | nindent 4 }}
{{- if or .Values.additionalAnnotations .Values.serviceAccount.annotations }}
annotations:
{{- with .Values.additionalAnnotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.serviceAccount.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
{{- end -}}
# Quote both IDs so an empty or scalar-looking value still renders as a YAML string
# rather than null or a number.
azure.workload.identity/client-id: {{ .Values.workloadIdentity.clientId | quote }}
azure.workload.identity/tenant-id: {{ .Values.workloadIdentity.tenantId | quote }}
17 changes: 4 additions & 13 deletions charts/gpu-provisioner/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,12 @@ controller:
value:
- name: LOCATION
value:
- name: ARM_USE_MANAGED_IDENTITY_EXTENSION
value: "true"
- name: ARM_USER_ASSIGNED_IDENTITY_ID
value:
- name: AZURE_CLUSTER_NAME
value:
- name: AZURE_NODE_RESOURCE_GROUP
value:
- name: ARM_RESOURCE_GROUP
value:
# TODO: move to settings
- name: AZURE_SUBNET_ID # the id of subnet to create network interfaces on
value:
- name: LEADER_ELECT # disable leader election for better debugging experience
value: "false"
envFrom: []
Expand All @@ -146,11 +139,6 @@ controller:
logLevel: debug
# -- Controller log encoding, defaults to the global log encoding
logEncoding: ""
# -- Additional volumeMounts for the controller pod.
extraVolumeMounts: []
sidecarContainer: []
# -- Additional volumeMounts for the sidecar - this will be added to the volume mounts on top of extraVolumeMounts
sidecarVolumeMounts: []
metrics:
# -- The container port to use for metrics.
port: 8000
Expand All @@ -162,11 +150,14 @@ logLevel: debug
# -- Global log encoding
logEncoding: console
# -- Global Settings to configure gpu-provisioner
workloadIdentity:
clientId: ""
tenantId: ""
settings:
# -- Azure-specific configuration values
azure:
# -- Cluster name.
clusterName: new_demo
clusterName: fed-gpu
# -- The global tags to use on all Azure infrastructure resources (VMs, etc.)
# TODO: not propagated yet ...
tags:
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.1.0
github.com/Azure/go-armbalancer v0.0.2
github.com/Azure/go-autorest/autorest v0.11.29
github.com/Azure/go-autorest/autorest/adal v0.9.23
github.com/Azure/go-autorest/autorest/to v0.4.0
github.com/Azure/skewer v0.0.19
github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1
github.com/aws/karpenter-core v0.29.2
github.com/go-playground/validator/v10 v10.13.0
github.com/onsi/ginkgo/v2 v2.11.0
Expand All @@ -38,11 +38,11 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/internal v1.3.0 // indirect
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1 // indirect
github.com/Azure/go-autorest v14.2.0+incompatible // indirect
github.com/Azure/go-autorest/autorest/adal v0.9.23 // indirect
github.com/Azure/go-autorest/autorest/date v0.3.0 // indirect
github.com/Azure/go-autorest/autorest/validation v0.3.1 // indirect
github.com/Azure/go-autorest/logger v0.2.1 // indirect
github.com/Azure/go-autorest/tracing v0.6.0 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1 // indirect
github.com/Pallinder/go-randomdata v1.2.0 // indirect
github.com/avast/retry-go v3.0.0+incompatible // indirect
github.com/benbjohnson/clock v1.1.0 // indirect
Expand Down
Loading

0 comments on commit b83c6bf

Please sign in to comment.