From 48e51ae7f6aa0259dac53d2eb56d4fbc30b60841 Mon Sep 17 00:00:00 2001
From: Oksana Baranova
Date: Sun, 5 Jan 2025 00:02:14 +0200
Subject: [PATCH] Add NFD rule for Gaudi resource driver (#69)

* add nfd rule

Signed-off-by: Oksana Baranova
---
 charts/intel-gaudi-resource-driver/Chart.yaml | 12 +++++++++--
 charts/intel-gaudi-resource-driver/README.md  |  6 ++++--
 .../templates/device-class.yaml               |  2 +-
 .../templates/nfd.yaml                        | 16 +++++++++++++++
 .../templates/resource-driver.yaml            |  5 +++++
 .../validating-admission-policy.yaml          |  2 +-
 .../intel-gaudi-resource-driver/values.yaml   | 20 ++++++++++++++++---
 7 files changed, 54 insertions(+), 9 deletions(-)
 create mode 100644 charts/intel-gaudi-resource-driver/templates/nfd.yaml

diff --git a/charts/intel-gaudi-resource-driver/Chart.yaml b/charts/intel-gaudi-resource-driver/Chart.yaml
index 0e78390..11b1766 100644
--- a/charts/intel-gaudi-resource-driver/Chart.yaml
+++ b/charts/intel-gaudi-resource-driver/Chart.yaml
@@ -3,5 +3,13 @@ name: intel-gaudi-resource-driver
 description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel Gaudi Resource Driver
 
 type: application
-version: 0.2.0
-appVersion: "v0.2.0"
+version: 0.3.0
+appVersion: "v0.3.0"
+home: https://github.com/intel/helm-charts
+
+dependencies:
+  - name: node-feature-discovery
+    alias: nfd
+    version: "0.16.6"
+    condition: nfd.enabled
+    repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
diff --git a/charts/intel-gaudi-resource-driver/README.md b/charts/intel-gaudi-resource-driver/README.md
index b54c538..9868908 100644
--- a/charts/intel-gaudi-resource-driver/README.md
+++ b/charts/intel-gaudi-resource-driver/README.md
@@ -16,7 +16,9 @@ helm repo update
 You can execute `helm search repo intel` command to see pulled charts [optional].
 
 ## Install Helm Chart
+When installing, update the dependencies:
 ```
+helm dependency update
 helm install intel-gaudi-resource-driver intel/intel-gaudi-resource-driver
 ```
 ## Upgrade Chart
@@ -43,7 +45,7 @@ You may also run `helm show values` on this chart's dependencies for additional
 | image.repository | string | `intel` |
 | image.name | string | `"intel-gaudi-resource-driver"` |
 | image.pullPolicy | string | `"IfNotPresent"` |
-| image.tag | string | `"v0.2.0"` |
+| image.tag | string | `"v0.3.0"` |
 
 > [!Note]
-> When upgrading, CRDs from previous version need to be removed manually because Helm supports neither upgrading nor deleting CRDs, see: https://github.com/helm/community/blob/main/hips/hip-0011.md
+> If you change the image tag used by the Helm chart deployment, ensure that the container image version is consistent with the deployment YAMLs, as they might change between releases.
diff --git a/charts/intel-gaudi-resource-driver/templates/device-class.yaml b/charts/intel-gaudi-resource-driver/templates/device-class.yaml
index 6be276b..4628a2d 100644
--- a/charts/intel-gaudi-resource-driver/templates/device-class.yaml
+++ b/charts/intel-gaudi-resource-driver/templates/device-class.yaml
@@ -1,4 +1,4 @@
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: DeviceClass
 metadata:
   name: gaudi.intel.com
diff --git a/charts/intel-gaudi-resource-driver/templates/nfd.yaml b/charts/intel-gaudi-resource-driver/templates/nfd.yaml
new file mode 100644
index 0000000..92fe18a
--- /dev/null
+++ b/charts/intel-gaudi-resource-driver/templates/nfd.yaml
@@ -0,0 +1,16 @@
+{{- if .Values.nfd.enabled }}
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gaudi-device-rule
+spec:
+  rules:
+    - name: "intel.gaudi"
+      labels:
+        "intel.feature.node.kubernetes.io/gaudi": "true"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor: {op: In, value: ["1da3"]}
+            device: {op: In, value: ["1020", "1030"]}
+{{- end }}
diff --git a/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml b/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml
index b95679f..e53872c 100644
--- a/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml
+++ b/charts/intel-gaudi-resource-driver/templates/resource-driver.yaml
@@ -73,10 +73,15 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      {{- if .Values.nfd.enabled }}
+      nodeSelector:
+        intel.feature.node.kubernetes.io/gaudi: "true"
+      {{- else }}
       {{- with .Values.kubeletPlugin.nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      {{- end }}
       {{- with .Values.kubeletPlugin.affinity }}
       affinity:
         {{- toYaml . | nindent 8 }}
diff --git a/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml b/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml
index a432733..7e779e4 100644
--- a/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml
+++ b/charts/intel-gaudi-resource-driver/templates/validating-admission-policy.yaml
@@ -7,7 +7,7 @@ spec:
   matchConstraints:
     resourceRules:
     - apiGroups: ["resource.k8s.io"]
-      apiVersions: ["v1alpha3"]
+      apiVersions: ["v1beta1"]
       operations: ["CREATE", "UPDATE", "DELETE"]
       resources: ["resourceslices"]
   matchConditions:
diff --git a/charts/intel-gaudi-resource-driver/values.yaml b/charts/intel-gaudi-resource-driver/values.yaml
index d87b4ba..0d41f56 100644
--- a/charts/intel-gaudi-resource-driver/values.yaml
+++ b/charts/intel-gaudi-resource-driver/values.yaml
@@ -9,7 +9,7 @@ image:
   repository: intel
   name: intel-gaudi-resource-driver
   pullPolicy: IfNotPresent
-  tag: "v0.2.0"
+  tag: "v0.3.0"
 
 serviceAccount:
   create: true
@@ -19,6 +19,9 @@ serviceAccount:
 
 kubeletPlugin:
   podAnnotations: {}
+  nodeSelector: {}
+  # Ignored when nfd.enabled is true: the kubelet plugin is then pinned to nodes labelled
+  # intel.feature.node.kubernetes.io/gaudi: "true"
   tolerations:
     - key: node-role.kubernetes.io/master
       operator: Exists
@@ -26,6 +29,17 @@ kubeletPlugin:
     - key: node-role.kubernetes.io/control-plane
       operator: Exists
       effect: NoSchedule
-  nodeSelector: {}
-  #node-role.kubernetes.io/control-plane: ""
+    # Refer to the official documentation for Node Feature Discovery (NFD)
+    # regarding node tainting:
+    # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
+    - key: "intel.feature.node.kubernetes.io/gaudi"
+      operator: "Exists"
+      effect: "NoSchedule"
   affinity: {}
+
+nfd:
+  enabled: false  # set to true to install NFD, add the Gaudi NodeFeatureRule and pin the kubelet plugin to labelled nodes
+  nameOverride: intel-gaudi-nfd
+  # TODO: this deprecated NFD option will be replaced in NFD v0.17 with "featureGates.NodeFeatureAPI" (added in v0.16):
+  # https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html#general-parameters
+  enableNodeFeatureApi: true