diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 503c0939..b4b7ba04 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -94,7 +94,9 @@ type DriverSpec struct { // +kubebuilder:default=true Enable *bool `json:"enable,omitempty"` - // blacklist amdgpu drivers on the host + // blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes. + // Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + // Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"} Blacklist *bool `json:"blacklist,omitempty"` @@ -115,6 +117,7 @@ type DriverSpec struct { // for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod // image tag will be in the format of --- // example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 + // NOTE: Updating the driver image repository is not supported. �
Please delete the existing DeviceConfig and create a new one with the updated image repository //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"} // +optional // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` @@ -595,6 +598,7 @@ type ModuleStatus struct { LastTransitionTime string `json:"lastTransitionTime,omitempty"` Status UpgradeState `json:"status,omitempty"` UpgradeStartTime string `json:"upgradeStartTime,omitempty"` + BootId string `json:"bootId,omitempty"` } // DeviceConfigStatus defines the observed state of Module. diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 45078acb..09886fe0 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -29,12 +29,30 @@ metadata: } } ] - capabilities: Basic Install - createdAt: "2025-03-25T06:19:27Z" + capabilities: Seamless Upgrades + categories: AI/Machine Learning,Monitoring + containerImage: docker.io/rocm/gpu-operator:v1.2.0 + createdAt: "2025-04-10T00:25:51Z" + description: |- + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter + For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) + devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + 
features.operators.openshift.io/token-auth-gcp: "false" + metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0 + nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest + operatorframework.io/cluster-monitoring: "true" operatorframework.io/suggested-namespace: openshift-amd-gpu + operators.openshift.io/valid-subscription: '[]' operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/ROCm/gpu-operator + support: Advanced Micro Devices, Inc. name: amd-gpu-operator.v1.2.0 namespace: placeholder spec: @@ -229,7 +247,10 @@ spec: path: driver.amdgpuInstallerRepoURL x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL - - description: blacklist amdgpu drivers on the host + - description: blacklist amdgpu drivers on the host. Node reboot is required + to apply the blacklist on the worker nodes. Not working for OpenShift cluster. + OpenShift users please use the Machine Config Operator (MCO) resource to + configure amdgpu blacklist. �
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module displayName: BlacklistDrivers path: driver.blacklist x-descriptors: @@ -241,13 +262,15 @@ spec: path: driver.enable x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:enable - - description: defines image that includes drivers and firmware blobs, don't + - description: 'defines image that includes drivers and firmware blobs, don''t include tag since it will be fully managed by operator for vanilla k8s the default value is image-registry:5000/$MOD_NAMESPACE/amdgpu_kmod for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 - and ubuntu-22.04-5.15.0-94-generic-6.1.3 + and ubuntu-22.04-5.15.0-94-generic-6.1.3 NOTE: Updating the driver image + repository is not supported. 
Please delete the existing DeviceConfig and + create a new one with the updated image repository' displayName: Image path: driver.image x-descriptors: @@ -608,7 +631,7 @@ spec: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 description: |- - Operator responsible for deploying AMD GPU kernel drivers and device plugin + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) displayName: amd-gpu-operator icon: @@ -1112,11 +1135,24 @@ spec: - supported: true type: AllNamespaces keywords: - - amd-gpu-operator + - AMD + - GPU + - AI + - Deep Learning + - Hardware + - Driver + - Monitoring links: - - name: Amd Gpu Operator - url: https://amd-gpu-operator.domain - maturity: alpha + - name: AMD GPU Operator + url: https://github.com/ROCm/gpu-operator + maintainers: + - email: Yan.Sun3@amd.com + name: Yan Sun + - email: farshad.ghodsian@amd.com + name: Farshad Ghodsian + - email: shrey.ajmera@amd.com + name: Shrey Ajmera + maturity: stable provider: - name: amd-gpu-operator + name: Advanced Micro Devices, Inc. version: 1.2.0 diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index c9123ffe..8a439b8d 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -342,7 +342,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. 
+ Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -357,6 +360,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 + NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: @@ -928,6 +932,8 @@ spec: description: ModuleStatus contains the status of driver module installed by operator on the node properties: + bootId: + type: string containerImage: type: string kernelVersion: diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 24c2b053..dfd71b78 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -338,7 +338,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. 
+ Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -353,6 +356,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 + NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: @@ -924,6 +928,8 @@ spec: description: ModuleStatus contains the status of driver module installed by operator on the node properties: + bootId: + type: string containerImage: type: string kernelVersion: diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index a9f4d685..c49d9c30 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -3,9 +3,27 @@ kind: ClusterServiceVersion metadata: annotations: alm-examples: '[]' - capabilities: Basic Install + capabilities: Seamless Upgrades + categories: AI/Machine Learning,Monitoring + containerImage: docker.io/rocm/gpu-operator:v1.2.0 + description: |- + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter + For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) + devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest + features.operators.openshift.io/disconnected: "true" + 
features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0 + nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest + operatorframework.io/cluster-monitoring: "true" operatorframework.io/suggested-namespace: openshift-amd-gpu + operators.openshift.io/valid-subscription: '[]' repository: https://github.com/ROCm/gpu-operator + support: Advanced Micro Devices, Inc. name: amd-gpu-operator.v0.0.0 namespace: placeholder spec: @@ -200,7 +218,10 @@ spec: path: driver.amdgpuInstallerRepoURL x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL - - description: blacklist amdgpu drivers on the host + - description: blacklist amdgpu drivers on the host. Node reboot is required + to apply the baclklist on the worker nodes. Not working for OpenShift cluster. + OpenShift users please use the Machine Config Operator (MCO) resource to + configure amdgpu blacklist. 
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module displayName: BlacklistDrivers path: driver.blacklist x-descriptors: @@ -212,13 +233,15 @@ spec: path: driver.enable x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:enable - - description: defines image that includes drivers and firmware blobs, don't + - description: 'defines image that includes drivers and firmware blobs, don''t include tag since it will be fully managed by operator for vanilla k8s the default value is image-registry:5000/$MOD_NAMESPACE/amdgpu_kmod for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 - and ubuntu-22.04-5.15.0-94-generic-6.1.3 + and ubuntu-22.04-5.15.0-94-generic-6.1.3 NOTE: Updating the driver image + repository is not supported. 
Please delete the existing DeviceConfig and + create a new one with the updated image repository' displayName: Image path: driver.image x-descriptors: @@ -579,7 +602,7 @@ spec: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 description: |- - Operator responsible for deploying AMD GPU kernel drivers and device plugin + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) displayName: amd-gpu-operator icon: @@ -599,11 +622,24 @@ spec: - supported: true type: AllNamespaces keywords: - - amd-gpu-operator + - AMD + - GPU + - AI + - Deep Learning + - Hardware + - Driver + - Monitoring links: - - name: Amd Gpu Operator - url: https://amd-gpu-operator.domain - maturity: alpha + - name: AMD GPU Operator + url: https://github.com/ROCm/gpu-operator + maintainers: + - email: Yan.Sun3@amd.com + name: Yan Sun + - email: farshad.ghodsian@amd.com + name: Farshad Ghodsian + - email: shrey.ajmera@amd.com + name: Shrey Ajmera + maturity: stable provider: - name: amd-gpu-operator + name: Advanced Micro Devices, Inc. 
version: 0.0.0 diff --git a/docs/conf.py b/docs/conf.py index c2086415..8fe2aaba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,21 +1,27 @@ """Configuration file for the Sphinx documentation builder.""" +import os +html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "instinct.docs.amd.com") +html_context = {} +if os.environ.get("READTHEDOCS", "") == "True": + html_context["READTHEDOCS"] = True external_projects_local_file = "projects.yaml" external_projects_remote_repository = "" external_projects = ["amd-gpu-operator"] external_projects_current_project = "amd-gpu-operator" -project = "AMD Instinct Documentation" +project = "AMD GPU Operator" version = "1.2.0" release = version -html_title = f"AMD GPU Operator {version}" +html_title = f"{project} {version}" author = "Advanced Micro Devices, Inc." -copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." +copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." # Required settings html_theme = "rocm_docs_theme" html_theme_options = { - "flavor": "instinct" + "flavor": "instinct", + "link_main_doc": True, # Add any additional theme options here } extensions = ["rocm_docs"] @@ -23,4 +29,4 @@ # Table of contents external_toc_path = "./sphinx/_toc.yml" -exclude_patterns = ['.venv'] +exclude_patterns = ['.venv'] \ No newline at end of file diff --git a/docs/device_plugin/device-plugin.md b/docs/device_plugin/device-plugin.md new file mode 100644 index 00000000..4ecfb97b --- /dev/null +++ b/docs/device_plugin/device-plugin.md @@ -0,0 +1,112 @@ +# Device Plugin + +## Configure device plugin + +To start the Device Plugin along with the GPU Operator configure fields under the ``` spec/devicePlugin ``` field in deviceconfig Custom Resource(CR) + +```yaml + devicePlugin: + # Specify the device plugin image + # default value is rocm/k8s-device-plugin:latest + devicePluginImage: rocm/k8s-device-plugin:latest + + # The device plugin arguments is used to pass supported 
flags and their values while starting device plugin daemonset + devicePluginArguments: + resource_naming_strategy: single + + # Specify the node labeller image + # default value is rocm/k8s-device-plugin:labeller-latest + nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest + + # Specify whether to bring up node labeller component + # default value is true + enableNodeLabeller: True + +``` + +The **device-plugin** pods start after updating the **DeviceConfig** CR + +```bash +#kubectl get pods -n kube-amd-gpu +NAME READY STATUS RESTARTS AGE +amd-gpu-operator-gpu-operator-charts-controller-manager-77tpmgn 1/1 Running 0 4h9m +amd-gpu-operator-kmm-controller-6d459dffcf-lbgtt 1/1 Running 0 4h9m +amd-gpu-operator-kmm-webhook-server-5fdc8b995-qgj49 1/1 Running 0 4h9m +amd-gpu-operator-node-feature-discovery-gc-78989c896-7lh8t 1/1 Running 0 3h48m +amd-gpu-operator-node-feature-discovery-master-b8bffc48b-6rnz6 1/1 Running 0 4h9m +amd-gpu-operator-node-feature-discovery-worker-m9lwn 1/1 Running 0 4h9m +test-deviceconfig-device-plugin-rk5f4 1/1 Running 0 134m +test-deviceconfig-node-labeller-bxk7x 1/1 Running 0 134m +``` + +
+Note: The Device Plugin name will be prefixed with the name of your DeviceConfig custom resource +

+ +## Device Plugin DeviceConfig +| Field Name | Details | +|----------------------------------|----------------------------------------------| +| **DevicePluginImage** | Device plugin image | +| **DevicePluginImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **NodeLabellerImage** | Node labeller image | +| **NodeLabellerImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **EnableNodeLabeller** | Enable/Disable node labeller with True/False | +| **DevicePluginArguments** | The flag/values to pass on to Device Plugin | +
+ +1. Both the `ImagePullPolicy` fields default to `Always` if `:latest` tag is specified on the respective Image, or defaults to `IfNotPresent` otherwise. This is default k8s behaviour for `ImagePullPolicy` + +2. `DevicePluginArguments` is of type `map[string]string`. Currently supported key value pairs to set under `DevicePluginArguments` are: + -> "resource_naming_strategy": {"single", "mixed"} + +## How to choose Resource Naming Strategy + +To customize the way device plugin reports gpu resources to kubernetes as allocatable k8s resources, use the `single` or `mixed` resource naming strategy in **DeviceConfig** CR +Before understanding each strategy, please note the definition of homogeneous and heterogeneous nodes + +Homogeneous node: A node whose gpu's follow the same compute-memory partition style + -> Example: A node of 8 GPU's where all 8 GPU's are following CPX-NPS4 partition style + +Heterogeneous node: A node whose gpu's follow different compute-memory partition styles + -> Example: A node of 8 GPU's where 5 GPU's are following SPX-NPS1 and 3 GPU's are following CPX-NPS1 + +### Single + +In `single` mode, the device plugin reports all gpu's (regardless of whether they are whole gpu's or partitions of a gpu) under the resource name `amd.com/gpu` +This mode is supported for homogeneous nodes but not supported for heterogeneous nodes + +A node which has 8 GPUs where all GPUs are not partitioned will report its resources as: + +```bash +amd.com/gpu: 8 +``` + +A node which has 8 GPUs where all GPUs are partitioned using CPX-NPS4 style will report its resources as: + +```bash +amd.com/gpu: 64 +``` + +### Mixed + +In `mixed` mode, the device plugin reports all gpu's under a name which matches its partition style. 
+This mode is supported for both homogeneous nodes and heterogeneous nodes + +A node which has 8 GPUs which are all partitioned using CPX-NPS4 style will report its resources as: + +```bash +amd.com/cpx_nps4: 64 +``` + +A node which has 8 GPUs where 5 GPU's are following SPX-NPS1 and 3 GPU's are following CPX-NPS1 will report its resources as: + +```bash +amd.com/spx_nps1: 5 +amd.com/cpx_nps1: 24 +``` + +#### **Notes** + +- If `resource_naming_strategy` is not passed using `DevicePluginArguments` field in CR, then device plugin will internally default to `single` resource naming strategy. This maintains backwards compatibility with earlier release of device plugin with reported resource name of `amd.com/gpu` +- If a node has GPUs which do not support partitioning, such as MI210, then the GPUs are reported under resource name `amd.com/gpu` regardless of the resource naming strategy +- These different naming styles of resources, for example, `amd.com/cpx_nps1` should be followed when requesting for resources in a pod spec \ No newline at end of file diff --git a/docs/drivers/installation.md b/docs/drivers/installation.md index 890da553..9825e546 100644 --- a/docs/drivers/installation.md +++ b/docs/drivers/installation.md @@ -96,9 +96,13 @@ spec: # enable operator to install out-of-tree amdgpu kernel module enable: true # blacklist is required for installing out-of-tree amdgpu kernel module + # Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + # Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module blacklist: true # Specify your repository to host driver image - # DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you + # Note: + # 1. �
DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you + # 2. Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository image: docker.io/username/repo # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access # you can create the docker-registry type secret by running command like: diff --git a/docs/fulldeviceconfig.rst b/docs/fulldeviceconfig.rst index 8d8c1d95..9f7b8441 100644 --- a/docs/fulldeviceconfig.rst +++ b/docs/fulldeviceconfig.rst @@ -38,10 +38,15 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM driver: # Set to false to skip driver installation to use inbox or pre-installed driver on worker nodes # Set to true to enable operator to install out-of-tree amdgpu kernel module - enable: false - blacklist: false # Set to true to blacklist the amdgpu kernel module which is required for installing out-of-tree driver + enable: false + # Set to true to blacklist the amdgpu kernel module which is required for installing out-of-tree driver + # Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + # Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module + blacklist: false # Specify your repository to host driver image - # DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you + # Note: + # 1. DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you + # 2. Updating the driver image repository is not supported. 
Please delete the existing DeviceConfig and create a new one with the updated image repository image: docker.io/username/repo # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access # you can create the docker-registry type secret by running command like: diff --git a/docs/installation/kubernetes-helm.md b/docs/installation/kubernetes-helm.md index c1415324..8681222f 100644 --- a/docs/installation/kubernetes-helm.md +++ b/docs/installation/kubernetes-helm.md @@ -163,6 +163,10 @@ The following parameters are able to be configued when using the Helm Chart. In | controllerManager.manager.image.tag | string | `"v1.2.0"` | AMD GPU operator controller manager image tag | | controllerManager.manager.imagePullPolicy | string | `"Always"` | Image pull policy for AMD GPU operator controller manager pod | | controllerManager.manager.imagePullSecrets | string | `""` | Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image | +| controllerManager.manager.resources.limits.cpu | string | `"1000m"` | CPU limits for the controller manager. Consider increasing for large clusters | +| controllerManager.manager.resources.limits.memory | string | `"1Gi"` | Memory limits for the controller manager. Consider increasing if experiencing OOM issues | +| controllerManager.manager.resources.requests.cpu | string | `"100m"` | CPU requests for the controller manager. Adjust based on observed CPU usage | +| controllerManager.manager.resources.requests.memory | string | `"256Mi"` | Memory requests for the controller manager. 
Adjust based on observed memory usage | | controllerManager.nodeSelector | object | `{}` | Node selector for AMD GPU operator controller manager deployment | | installdefaultNFDRule | bool | `true` | Default NFD rule will detect amd gpu based on pci vendor ID | | kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator | @@ -258,6 +262,42 @@ Verify that nodes with AMD GPU hardware are properly labeled: kubectl get nodes -L feature.node.kubernetes.io/amd-gpu ``` +## Resource Configuration + +### Controller Manager Resource Settings + +The AMD GPU Operator controller manager component has default resource limits and requests configured for typical usage scenarios. You may need to adjust these values based on your specific cluster environment: + +```yaml +controllerManager: + manager: + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 100m + memory: 256Mi +``` + +#### When to Adjust Resource Settings + +You should consider adjusting the controller manager resource settings in these scenarios: + +- **Large clusters**: If managing a large number of nodes or GPU devices, consider increasing both CPU and memory limits +- **Memory pressure**: If you observe OOM (Out of Memory) kills in controller manager pods, increase the memory limit and request +- **CPU pressure**: If the controller manager is experiencing throttling or slow response times during operations, increase the CPU limit and request +- **Resource-constrained environments**: For smaller development or test clusters, you may reduce these values to conserve resources + +You can apply resource changes by updating your values.yaml file and upgrading the Helm release: + +```bash +helm upgrade amd-gpu-operator amd/gpu-operator-helm \ + --namespace kube-amd-gpu \ + --version=v1.2.0 \ + -f values.yaml +``` + ## Install Custom Resource After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in �
order to trigger the operator start to work. By preparing the `DeviceConfig` in the YAML file, you can create the resouce by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](../drivers/installation). Here are some examples for common deployment scenarios. diff --git a/docs/knownlimitations.md b/docs/knownlimitations.md index 051b48cd..a21fc6d3 100644 --- a/docs/knownlimitations.md +++ b/docs/knownlimitations.md @@ -85,6 +85,20 @@ - **Recommendation:** Ensure nodes are fully stable before triggering an upgrade, and if necessary, manually update node labels to enforce the new driver version. Refer to driver upgrade documentation for more details.

+13. **Driver Upgrade Issue when maxParallel Upgrades is equal to total number of worker nodes in Red Hat OpenShift** + + - **Impact:** Not able to perform driver upgrade + - **Affected Configurations:** This issue only affects Red Hat OpenShift when Image registry pod is running on one of the worker nodes or kmm build pod is required to be run on one of the worker nodes + - **Recommendation:** Please set maxParallel Upgrades to a number less than total number of worker nodes +

+ +14. **Driver Install/Upgrade Issue if one of the nodes where KMM is running build pod gets rebooted accidentally when rebootRequired is set to false** + + - **Impact:** Not able to perform driver install/upgrade + - **Affected Configurations:** All configurations + - **Recommendation:** Please retrigger driver install/upgrade and ensure to not reboot node manually when rebootRequired is false +�

+ ## Fixed Issues 1. **When GPU Operator is installed with Exporter enabled, upgrade of driver is blocked as exporter is actively using the amdgpu module (Fixed in v1.2.0)** diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 78600aa6..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -rocm-docs-core diff --git a/docs/sphinx/_toc.yml b/docs/sphinx/_toc.yml index a232e7ab..62786ea4 100644 --- a/docs/sphinx/_toc.yml +++ b/docs/sphinx/_toc.yml @@ -44,6 +44,9 @@ subtrees: - file: test/manual-test - file: test/pre-start-job-test - file: test/appendix-test-recipe + - caption: Device Plugin + entries: + - file: device_plugin/device-plugin - caption: Specialized Networks entries: - file: specialized_networks/airgapped-install diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a232e7ab..62786ea4 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -44,6 +44,9 @@ subtrees: - file: test/manual-test - file: test/pre-start-job-test - file: test/appendix-test-recipe + - caption: Device Plugin + entries: + - file: device_plugin/device-plugin - caption: Specialized Networks entries: - file: specialized_networks/airgapped-install diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 5efe4f66..e75ed236 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core==1.17.1 -sphinx-reredirects +rocm-docs-core==1.18.1 +sphinx-reredirects \ No newline at end of file diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index fc912bea..cf7a01fd 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -15,38 +15,39 @@ attrs==25.1.0 # jsonschema # jupyter-cache # referencing -babel==2.17.0 +babel==2.16.0 # via # pydata-sphinx-theme # sphinx -beautifulsoup4==4.13.3 +beautifulsoup4==4.12.3 # via pydata-sphinx-theme -breathe==4.36.0 +breathe==4.35.0 # via rocm-docs-core -certifi==2025.1.31 
+certifi==2024.8.30 # via requests cffi==1.17.1 # via # cryptography # pynacl -charset-normalizer==3.4.1 +charset-normalizer==3.4.0 # via requests -click==8.1.8 +click==8.1.7 # via # jupyter-cache # sphinx-external-toc comm==0.2.2 # via ipykernel -cryptography==44.0.2 +cryptography==43.0.3 # via pyjwt -debugpy==1.8.13 +debugpy==1.8.12 # via ipykernel -decorator==5.2.1 +decorator==5.1.1 # via ipython -deprecated==1.2.18 +deprecated==1.2.15 # via pygithub docutils==0.21.2 # via + # breathe # myst-parser # pydata-sphinx-theme # sphinx @@ -54,13 +55,13 @@ exceptiongroup==1.2.2 # via ipython executing==2.2.0 # via stack-data -fastjsonschema==2.21.1 +fastjsonschema==2.20.0 # via # nbformat # rocm-docs-core -gitdb==4.0.12 +gitdb==4.0.11 # via gitpython -gitpython==3.1.44 +gitpython==3.1.43 # via rocm-docs-core greenlet==3.1.1 # via sqlalchemy @@ -74,13 +75,13 @@ importlib-metadata==8.6.1 # myst-nb ipykernel==6.29.5 # via myst-nb -ipython==8.33.0 +ipython==8.31.0 # via # ipykernel # myst-nb jedi==0.19.2 # via ipython -jinja2==3.1.6 +jinja2==3.1.4 # via # myst-parser # sphinx @@ -114,9 +115,9 @@ mdit-py-plugins==0.4.2 # via myst-parser mdurl==0.1.2 # via markdown-it-py -myst-nb==1.2.0 +myst-nb==1.1.2 # via rocm-docs-core -myst-parser==4.0.1 +myst-parser==4.0.0 # via myst-nb nbclient==0.10.2 # via @@ -132,7 +133,6 @@ nest-asyncio==1.6.0 packaging==24.2 # via # ipykernel - # pydata-sphinx-theme # sphinx parso==0.8.4 # via jedi @@ -142,7 +142,7 @@ platformdirs==4.3.6 # via jupyter-core prompt-toolkit==3.0.50 # via ipython -psutil==7.0.0 +psutil==6.1.1 # via ipykernel ptyprocess==0.7.0 # via pexpect @@ -150,19 +150,19 @@ pure-eval==0.2.3 # via stack-data pycparser==2.22 # via cffi -pydata-sphinx-theme==0.15.4 +pydata-sphinx-theme==0.16.0 # via # rocm-docs-core # sphinx-book-theme -pygithub==2.6.1 +pygithub==2.5.0 # via rocm-docs-core -pygments==2.19.1 +pygments==2.18.0 # via # accessible-pygments # ipython # pydata-sphinx-theme # sphinx -pyjwt[crypto]==2.10.1 
+pyjwt[crypto]==2.10.0 # via pygithub pynacl==1.5.0 # via pygithub @@ -187,15 +187,15 @@ requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core==1.17.1 +rocm-docs-core==1.18.1 # via -r requirements.in -rpds-py==0.23.1 +rpds-py==0.22.3 # via # jsonschema # referencing six==1.17.0 # via python-dateutil -smmap==5.0.2 +smmap==5.0.1 # via gitdb snowballstemmer==2.2.0 # via sphinx @@ -214,7 +214,7 @@ sphinx==8.1.3 # sphinx-external-toc # sphinx-notfound-page # sphinx-reredirects -sphinx-book-theme==1.1.4 +sphinx-book-theme==1.1.3 # via rocm-docs-core sphinx-copybutton==0.5.2 # via rocm-docs-core @@ -222,7 +222,7 @@ sphinx-design==0.6.1 # via rocm-docs-core sphinx-external-toc==1.0.1 # via rocm-docs-core -sphinx-notfound-page==1.1.0 +sphinx-notfound-page==1.0.4 # via rocm-docs-core sphinx-reredirects==0.1.5 # via -r requirements.in @@ -238,13 +238,13 @@ sphinxcontrib-qthelp==2.0.0 # via sphinx sphinxcontrib-serializinghtml==2.0.0 # via sphinx -sqlalchemy==2.0.38 +sqlalchemy==2.0.37 # via jupyter-cache stack-data==0.6.3 # via ipython tabulate==0.9.0 # via jupyter-cache -tomli==2.2.1 +tomli==2.1.0 # via sphinx tornado==6.4.2 # via @@ -262,20 +262,19 @@ traitlets==5.14.3 # nbformat typing-extensions==4.12.2 # via - # beautifulsoup4 # ipython # myst-nb # pydata-sphinx-theme # pygithub # referencing # sqlalchemy -urllib3==2.3.0 +urllib3==2.2.3 # via # pygithub # requests wcwidth==0.2.13 # via prompt-toolkit -wrapt==1.17.2 +wrapt==1.17.0 # via deprecated zipp==3.21.0 - # via importlib-metadata + # via importlib-metadata \ No newline at end of file diff --git a/docs/test/auto-unhealthy-device-test.md b/docs/test/auto-unhealthy-device-test.md index 0b6e9cb3..c610a32c 100644 --- a/docs/test/auto-unhealthy-device-test.md +++ b/docs/test/auto-unhealthy-device-test.md @@ -4,6 +4,10 @@ Test runner is periodically watching for the device health status from device metrics exporter per 30 seconds. 
Once exporter reported GPU status is unhealthy, test runner will start to run one-time test on the unhealthy GPU. The test result will be exported as Kubernetes event. +```{warning} +The RVS test recipes in the Test Runner aren't compatible with partitioned GPUs. To address this, either disable the test runner by setting ```spec/testRunner/enable``` to ```false```, or configure the test runner to run only on nodes without partitioned GPUs by using ```spec/testRunner/selector```. +``` + ## Configure test runner To start the Test Runner along with the GPU Operator, Device Metrics Exporter must be enabled since Test Runner is depending on the exported health status. Configure the ``` spec/metricsExporter/enable ``` field in deviceconfig Custom Resource(CR) to enable/disable metrics exporter and configure the ``` spec/testRunner/enable ``` field in deviceconfig Custom Resource(CR) to enable/disable test runner. diff --git a/docs/test/manual-test.md b/docs/test/manual-test.md index c00ac288..c4ba4bae 100644 --- a/docs/test/manual-test.md +++ b/docs/test/manual-test.md @@ -4,6 +4,10 @@ To start the manual test, directly use the test runner image to create the Kubernetes job and related resources, then the test will be triggered manually. +```{warning} +The RVS test recipes in the Test Runner are not compatible with partitioned GPUs. If you are using a partitioned GPU, please reset the GPU partition configuration and conduct the manual test on a non-partitioned GPU. +``` + ## Use Case 1 - GPU is unhealthy on the node When any GPU on a specific worker node is unhealthy, you can manually trigger a test / benchmark run on that worker node to check more details on the unhealthy state. The test job requires RBAC config to grant the test runner access to export events and add node labels to the cluster. 
Here is an example of configuring the RBAC and Job resources: diff --git a/docs/test/pre-start-job-test.md b/docs/test/pre-start-job-test.md index 2bad5332..a376ba73 100644 --- a/docs/test/pre-start-job-test.md +++ b/docs/test/pre-start-job-test.md @@ -4,6 +4,18 @@ Test runner can be embedded as an init container within your Kubernetes workload pod definition. The init container will be executed before the actual workload containers start, in that way the system could be tested right before the workload start to use the hardware resource. +```{warning} +The RVS test recipes in the Test Runner are not compatible with partitioned GPUs. If you are using a partitioned GPU, avoid running the Test Runner as an init container for the pre-start job test. +``` + +```{warning} +* Known Issue: Within a pod, the initContainer and workload container might not be assigned the same GPUs. + +* Workaround: The example in this document remains applicable if both initContainer and workload containers request all GPUs on the same node. + +* Future Solution: With the introduction of [Dynamic Resource Allocation](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/), both initContainer and workload container will be able to share the same set of GPUs. +``` + ## Configure pre-start init container The init container requires RBAC config to grant the pod access to export events and add node labels to the cluster. 
Here is an example of configuring the RBAC and Job resources: @@ -73,8 +85,8 @@ spec: image: docker.io/rocm/test-runner:v1.2.0-beta.0 imagePullPolicy: IfNotPresent resources: - limits: - amd.com/gpu: 1 # requesting a GPU + requests: + amd.com/gpu: 8 # requesting all GPUs on the worker node env: - name: TEST_TRIGGER value: "PRE_START_JOB_CHECK" # Set the TEST_TRIGGER environment variable to PRE_START_JOB_CHECK for test runner as init container @@ -96,8 +108,8 @@ spec: command: ["/bin/sh", "-c", "--"] args: ["sleep 6000"] resources: - limits: - amd.com/gpu: 1 # requesting a GPU + requests: + amd.com/gpu: 8 # requesting all GPUs on the worker node ``` ## Check test runner init container diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 71bfd56c..6e6e0a0d 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -47,11 +47,11 @@ controllerManager: effect: "NoSchedule" resources: limits: - cpu: 500m - memory: 384Mi + cpu: 1000m + memory: 1Gi requests: - cpu: 10m - memory: 64Mi + cpu: 100m + memory: 256Mi # -- Node selector for AMD GPU operator controller manager deployment nodeSelector: {} # -- Deployment affinity configs for controller manager diff --git a/hack/openshift-patch/metadata-patch/values.yaml b/hack/openshift-patch/metadata-patch/values.yaml index b0b937a9..2bdb27ad 100644 --- a/hack/openshift-patch/metadata-patch/values.yaml +++ b/hack/openshift-patch/metadata-patch/values.yaml @@ -26,11 +26,11 @@ controllerManager: effect: "NoSchedule" resources: limits: - cpu: 500m - memory: 384Mi + cpu: 1000m + memory: 1Gi requests: - cpu: 10m - memory: 64Mi + cpu: 100m + memory: 256Mi nodeSelector: {} affinity: nodeAffinity: diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 54b4cb8c..95811e74 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 
digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-25T06:19:17.248998622Z" +generated: "2025-04-10T00:25:36.698574082Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 502f4b89..24669303 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -346,7 +346,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -361,6 +364,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 + NOTE: Updating the driver image repository is not supported. 
Please delete the existing DeviceConfig and create a new one with the updated image repository pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: @@ -929,6 +933,8 @@ spec: description: ModuleStatus contains the status of driver module installed by operator on the node properties: + bootId: type: string containerImage: type: string kernelVersion: diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 71bfd56c..6e6e0a0d 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -47,11 +47,11 @@ controllerManager: effect: "NoSchedule" resources: limits: - cpu: 500m - memory: 384Mi + cpu: 1000m + memory: 1Gi requests: - cpu: 10m - memory: 64Mi + cpu: 100m + memory: 256Mi # -- Node selector for AMD GPU operator controller manager deployment nodeSelector: {} # -- Deployment affinity configs for controller manager diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 6e9b718d..ea8bd255 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-25T06:19:26.060856628Z" +generated: "2025-04-10T00:25:48.698223085Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 502f4b89..24669303 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -346,7 +346,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes. 
+ Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -361,6 +364,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 + NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: @@ -929,6 +933,8 @@ spec: description: ModuleStatus contains the status of driver module installed by operator on the node properties: + bootId: + type: string containerImage: type: string kernelVersion: diff --git a/helm-charts-openshift/values.yaml b/helm-charts-openshift/values.yaml index b0b937a9..2bdb27ad 100644 --- a/helm-charts-openshift/values.yaml +++ b/helm-charts-openshift/values.yaml @@ -26,11 +26,11 @@ controllerManager: effect: "NoSchedule" resources: limits: - cpu: 500m - memory: 384Mi + cpu: 1000m + memory: 1Gi requests: - cpu: 10m - memory: 64Mi + cpu: 100m + memory: 256Mi nodeSelector: {} affinity: nodeAffinity: diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index 2e782fb5..7486a8b7 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -593,9 +593,11 @@ func (dcrh *deviceConfigReconcilerHelper) getDeviceConfigOwnedKMMModule(ctx cont func (dcrh 
*deviceConfigReconcilerHelper) updateDeviceConfigNodeStatus(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { logger := log.FromContext(ctx) previousUpgradeTimes := make(map[string]string) + previousBootIds := make(map[string]string) // Persist the UpgradeStartTime for nodeName, moduleStatus := range devConfig.Status.NodeModuleStatus { previousUpgradeTimes[nodeName] = moduleStatus.UpgradeStartTime + previousBootIds[nodeName] = moduleStatus.BootId } devConfig.Status.NodeModuleStatus = map[string]amdv1alpha1.ModuleStatus{} @@ -610,7 +612,12 @@ func (dcrh *deviceConfigReconcilerHelper) updateDeviceConfigNodeStatus(ctx conte if upgradeStartTime == "" { upgradeStartTime = previousUpgradeTimes[node.Name] } - devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name), UpgradeStartTime: upgradeStartTime} + bootId := dcrh.upgradeMgrHandler.GetNodeBootId(node.Name) + //If operator restarted during Upgrade, then fetch previous known bootId since the internal maps would have been cleared + if bootId == "" { + bootId = previousBootIds[node.Name] + } + devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name), UpgradeStartTime: upgradeStartTime, BootId: bootId} nmc := kmmv1beta1.NodeModulesConfig{} err := dcrh.client.Get(ctx, types.NamespacedName{Name: node.Name}, &nmc) @@ -632,6 +639,7 @@ func (dcrh *deviceConfigReconcilerHelper) updateDeviceConfigNodeStatus(ctx conte LastTransitionTime: module.LastTransitionTime.String(), Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name), UpgradeStartTime: upgradeStartTime, + BootId: bootId, } } } diff --git a/internal/controllers/mock_upgrademgr.go b/internal/controllers/mock_upgrademgr.go index 7db0fa9c..33e8332e 100644 --- a/internal/controllers/mock_upgrademgr.go +++ b/internal/controllers/mock_upgrademgr.go @@ -57,6 +57,20 @@ func (m *MockupgradeMgrAPI) 
EXPECT() *MockupgradeMgrAPIMockRecorder { return m.recorder } +// GetNodeBootId mocks base method. +func (m *MockupgradeMgrAPI) GetNodeBootId(nodeName string) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetNodeBootId", nodeName) + ret0, _ := ret[0].(string) + return ret0 +} + +// GetNodeBootId indicates an expected call of GetNodeBootId. +func (mr *MockupgradeMgrAPIMockRecorder) GetNodeBootId(nodeName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNodeBootId", reflect.TypeOf((*MockupgradeMgrAPI)(nil).GetNodeBootId), nodeName) +} + // GetNodeStatus mocks base method. func (m *MockupgradeMgrAPI) GetNodeStatus(nodeName string) v1alpha1.UpgradeState { m.ctrl.T.Helper() @@ -216,6 +230,20 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) deleteRebootPod(ctx, nodeName, dc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteRebootPod", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).deleteRebootPod), ctx, nodeName, dc, force, genId) } +// getBootID mocks base method. +func (m *MockupgradeMgrHelperAPI) getBootID(nodeName string) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getBootID", nodeName) + ret0, _ := ret[0].(string) + return ret0 +} + +// getBootID indicates an expected call of getBootID. +func (mr *MockupgradeMgrHelperAPIMockRecorder) getBootID(nodeName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getBootID", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).getBootID), nodeName) +} + // getNode mocks base method. func (m *MockupgradeMgrHelperAPI) getNode(ctx context.Context, nodeName string) (*v1.Node, error) { m.ctrl.T.Helper() @@ -366,6 +394,20 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) isNodeNew(ctx, node, deviceConfig return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isNodeNew", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).isNodeNew), ctx, node, deviceConfig) } +// isNodeNmcStatusMissing mocks base method. 
+func (m *MockupgradeMgrHelperAPI) isNodeNmcStatusMissing(ctx context.Context, node *v1.Node, deviceConfig *v1alpha1.DeviceConfig) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "isNodeNmcStatusMissing", ctx, node, deviceConfig) + ret0, _ := ret[0].(bool) + return ret0 +} + +// isNodeNmcStatusMissing indicates an expected call of isNodeNmcStatusMissing. +func (mr *MockupgradeMgrHelperAPIMockRecorder) isNodeNmcStatusMissing(ctx, node, deviceConfig any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isNodeNmcStatusMissing", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).isNodeNmcStatusMissing), ctx, node, deviceConfig) +} + // isNodeReady mocks base method. func (m *MockupgradeMgrHelperAPI) isNodeReady(ctx context.Context, node *v1.Node, deviceConfig *v1alpha1.DeviceConfig) bool { m.ctrl.T.Helper() @@ -465,6 +507,18 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) isUpgradePolicyViolated(upgradeIn return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isUpgradePolicyViolated", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).isUpgradePolicyViolated), upgradeInProgress, upgradeFailedState, totalNodes, deviceConfig) } +// setBootID mocks base method. +func (m *MockupgradeMgrHelperAPI) setBootID(nodeName, bootID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "setBootID", nodeName, bootID) +} + +// setBootID indicates an expected call of setBootID. +func (mr *MockupgradeMgrHelperAPIMockRecorder) setBootID(nodeName, bootID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "setBootID", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).setBootID), nodeName, bootID) +} + // setNodeStatus mocks base method. 
func (m *MockupgradeMgrHelperAPI) setNodeStatus(ctx context.Context, nodeName string, status v1alpha1.UpgradeState) { m.ctrl.T.Helper() diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index a5e519b2..2407dbe2 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -64,6 +64,25 @@ const ( defaultSAName = "amd-gpu-operator-utils-container" ) +var ( + computePartitionTypes = []string{"spx", "cpx", "dpx", "qpx", "tpx"} + memoryPartitionTypes = []string{"nps1", "nps4"} + validResources = buildValidResources() +) + +func buildValidResources() map[string]struct{} { + resources := map[string]struct{}{ + "amd.com/gpu": {}, + } + for _, compute := range computePartitionTypes { + for _, memory := range memoryPartitionTypes { + resourceName := fmt.Sprintf("amd.com/%s_%s", compute, memory) + resources[resourceName] = struct{}{} + } + } + return resources +} + type upgradeMgr struct { helper upgradeMgrHelperAPI } @@ -74,6 +93,7 @@ type upgradeMgrAPI interface { HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) (ctrl.Result, error) GetNodeStatus(nodeName string) amdv1alpha1.UpgradeState GetNodeUpgradeStartTime(nodeName string) string + GetNodeBootId(nodeName string) string } func newUpgradeMgrHandler(client client.Client, k8sConfig *rest.Config) upgradeMgrAPI { @@ -108,16 +128,25 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha if deviceConfig.Spec.Driver.UpgradePolicy.RebootRequired != nil && *deviceConfig.Spec.Driver.UpgradePolicy.RebootRequired { nodeObj, err := n.helper.getNode(ctx, nodeName) if err == nil { - log.FromContext(ctx).Info("Reboot is required for driver upgrade, triggering node reboot") - n.helper.handleNodeReboot(ctx, nodeObj, deviceConfig) + // trigger reboot only for nodes which are in UpgradeStarted but haven't rebooted yet + if nodeObj.Status.NodeInfo.BootID == moduleStatus.BootId { + 
log.FromContext(ctx).Info(fmt.Sprintf("Node: %v: Reboot is required for driver upgrade, triggering node reboot", nodeName)) + n.helper.handleNodeReboot(ctx, nodeObj, deviceConfig) + // for nodes which are in UpgradeStarted but already rebooted. Schedule the reboot pod deletion + } else { + currentBootID := nodeObj.Status.NodeInfo.BootID + n.helper.setBootID(nodeObj.Name, currentBootID) + log.FromContext(ctx).Info(fmt.Sprintf("Node: %v: Node already rebooted, scheduling reboot pod deletion", nodeName)) + go n.helper.deleteRebootPod(ctx, nodeName, deviceConfig, false, deviceConfig.Generation) + } } } else { - log.FromContext(ctx).Info("Resetting Upgrade State to UpgradeStateEmpty") + log.FromContext(ctx).Info(fmt.Sprintf("Node: %v: Resetting Upgrade State to UpgradeStateEmpty", nodeName)) n.helper.setNodeStatus(ctx, nodeName, amdv1alpha1.UpgradeStateEmpty) } } else if moduleStatus.Status == amdv1alpha1.UpgradeStateRebootInProgress { // Operator restarted during upgrade operation. Schedule the reboot pod deletion - log.FromContext(ctx).Info("Reboot is in progress, scheduling reboot pod deletion") + log.FromContext(ctx).Info(fmt.Sprintf("Node: %v: Reboot is in progress, scheduling reboot pod deletion", nodeName)) n.helper.setNodeStatus(ctx, nodeName, moduleStatus.Status) go n.helper.deleteRebootPod(ctx, nodeName, deviceConfig, false, deviceConfig.Generation) } else { @@ -158,6 +187,12 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha continue } + // Untaint to let upgrade continue in case of KMM bug after node reboot + if n.helper.isNodeNmcStatusMissing(ctx, &nodeList.Items[i], deviceConfig) { + upgradeInProgress++ + continue + } + // 3. 
Handle Started Nodes if n.helper.isNodeStateUpgradeStarted(&nodeList.Items[i]) { upgradeInProgress++ @@ -244,11 +279,16 @@ func (n *upgradeMgr) GetNodeStatus(nodeName string) (status amdv1alpha1.UpgradeS return n.helper.getNodeStatus(nodeName) } -// GetNodeStaGetNodeUpgradeStartTimetus returns the time when upgrade started on the node +// GetNodeUpgradeStartTime returns the time when upgrade started on the node func (n *upgradeMgr) GetNodeUpgradeStartTime(nodeName string) string { return n.helper.getUpgradeStartTime(nodeName) } +// GetNodeBootId returns the last known bootid of the node +func (n *upgradeMgr) GetNodeBootId(nodeName string) string { + return n.helper.getBootID(nodeName) +} + /*=========================================== Upgrade Manager Helper APIs ==========================================*/ //go:generate mockgen -source=upgrademgr.go -package=controllers -destination=mock_upgrademgr.go upgradeMgrHelperAPI @@ -258,6 +298,7 @@ type upgradeMgrHelperAPI interface { // Handle node state transitions isNodeReady(ctx context.Context, node *v1.Node, deviceConfig *amdv1alpha1.DeviceConfig) bool + isNodeNmcStatusMissing(ctx context.Context, node *v1.Node, deviceConfig *amdv1alpha1.DeviceConfig) bool isNodeNew(ctx context.Context, node *v1.Node, deviceConfig *amdv1alpha1.DeviceConfig) bool isNodeStateUpgradeStarted(node *v1.Node) bool isNodeStateInstallInProgress(ctx context.Context, node *v1.Node, deviceConfig *amdv1alpha1.DeviceConfig) bool @@ -287,6 +328,8 @@ type upgradeMgrHelperAPI interface { setUpgradeStartTime(nodeName string) clearUpgradeStartTime(nodeName string) checkUpgradeTimeExceeded(ctx context.Context, nodeName string, deviceConfig *amdv1alpha1.DeviceConfig) bool + getBootID(nodeName string) string + setBootID(nodeName string, bootID string) clearNodeStatus() isInit() bool } @@ -297,6 +340,7 @@ type upgradeMgrHelper struct { drainHelper *drain.Helper nodeStatus *sync.Map nodeUpgradeStartTime *sync.Map + nodeBootID *sync.Map init bool currentSpec 
driverSpec } @@ -313,6 +357,7 @@ func newUpgradeMgrHelperHandler(client client.Client, k8sInterface kubernetes.In k8sInterface: k8sInterface, nodeStatus: new(sync.Map), nodeUpgradeStartTime: new(sync.Map), + nodeBootID: new(sync.Map), } } @@ -367,6 +412,31 @@ func (h *upgradeMgrHelper) isNodeNew(ctx context.Context, node *v1.Node, deviceC return false } +// Handle Driver installation for nodes with nmc status missing +func (h *upgradeMgrHelper) isNodeNmcStatusMissing(ctx context.Context, node *v1.Node, deviceConfig *amdv1alpha1.DeviceConfig) bool { + + if nodeStatus, ok := deviceConfig.Status.NodeModuleStatus[node.Name]; ok { + currentState := h.getNodeStatus(node.Name) + // during the automatic upgrade, if node reboot was triggered, KMM could possibly remove the NMC status, making the ContainerImage empty + // https://github.com/rh-ecosystem-edge/kernel-module-management/blob/b57037ec1b8ceef9961ca1baeb9529121c6df398/internal/controllers/nmc_reconciler.go#L414-L419 + // at this moment the node status would be UpgradeStateInProgress with empty ContainerImage + // we still need to proceed with this status + if nodeStatus.ContainerImage == "" && currentState == amdv1alpha1.UpgradeStateInProgress { + + // Uncordon the node + if err := h.cordonOrUncordonNode(ctx, deviceConfig, node, false); err != nil { + // Move to failure state if uncordon fails + h.setNodeStatus(ctx, node.Name, amdv1alpha1.UpgradeStateUncordonFailed) + return false + } + + return true + } + } + + return false +} + // Handle Driver installation for ready nodes. 
func (h *upgradeMgrHelper) isNodeReady(ctx context.Context, node *v1.Node, deviceConfig *amdv1alpha1.DeviceConfig) bool { @@ -478,7 +548,23 @@ func (h *upgradeMgrHelper) isUpgradePolicyViolated(upgradeInProgress int, upgrad return maxParallelUpdates, true } - return maxParallelUpdates, (upgradeInProgress >= maxParallelUpdates) || (upgradeFailedState >= maxUnavailableNodes) + // Remaining space for unavailable nodes + remainingUnavailable := maxUnavailableNodes - upgradeFailedState + + var maxParallelAllowed int + if maxParallelUpdates == 0 { + // "0 means Unlimited parallel" — so allow up to remaining unavailable + maxParallelAllowed = remainingUnavailable + } else { + // Take into consideration minimum between configured value and remaining unavailable + maxParallelAllowed = min(maxParallelUpdates, remainingUnavailable) + } + + if maxParallelAllowed == 0 || upgradeInProgress >= maxParallelAllowed { + return maxParallelAllowed, true + } + + return maxParallelAllowed, false } @@ -527,6 +613,18 @@ func (h *upgradeMgrHelper) checkUpgradeTimeExceeded(ctx context.Context, nodeNam return false } +func (h *upgradeMgrHelper) getBootID(nodeName string) string { + if value, ok := h.nodeBootID.Load(nodeName); ok { + return value.(string) + } + + return "" +} + +func (h *upgradeMgrHelper) setBootID(nodeName string, currentbootID string) { + h.nodeBootID.Store(nodeName, currentbootID) +} + func (h *upgradeMgrHelper) getNodeStatus(nodeName string) amdv1alpha1.UpgradeState { if value, ok := h.nodeStatus.Load(nodeName); ok { @@ -632,9 +730,11 @@ func (h *upgradeMgrHelper) getPodsToDrainOrDelete(ctx context.Context, deviceCon continue } for _, container := range pod.Spec.Containers { - if _, ok := container.Resources.Requests["amd.com/gpu"]; ok { - newPods = append(newPods, pod) - break + for resourceName := range container.Resources.Requests { + if _, ok := validResources[string(resourceName)]; ok { + newPods = append(newPods, pod) + break + } } } } @@ -867,6 +967,8 @@ func (h 
*upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node, // Wait for the driver upgrade to complete waitForDriverUpgrade() + currentBootID := node.Status.NodeInfo.BootID + h.setBootID(node.Name, currentBootID) if err := h.client.Create(ctx, rebootPod); err != nil { logger.Error(err, fmt.Sprintf("Node: %v State: %v RebootPod Create failed with Error: %v", node.Name, h.getNodeStatus(node.Name), err)) // Mark the state as failed @@ -888,6 +990,11 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node, } } + if nodeObj.Status.NodeInfo.BootID != h.getBootID(node.Name) { + h.setBootID(node.Name, nodeObj.Status.NodeInfo.BootID) + logger.Info(fmt.Sprintf("Node: %v has rebooted", node.Name)) + return + } // If node is NotReady, proceed; otherwise, wait for the next tick if nodeNotReady { logger.Info(fmt.Sprintf("Node: %v has moved to NotReady", node.Name)) diff --git a/internal/metricsexporter/metricsexporter.go b/internal/metricsexporter/metricsexporter.go index 7ead7bca..c57341fb 100644 --- a/internal/metricsexporter/metricsexporter.go +++ b/internal/metricsexporter/metricsexporter.go @@ -240,7 +240,7 @@ func (nl *metricsExporter) SetMetricsExporterAsDesired(ds *appsv1.DaemonSet, dev if internalPort == port { internalPort = port - 1 } - // Bind service port to localhost only + // Bind service port to localhost only, don't expose port in ContainerPort containers[0].Args = []string{"--bind=127.0.0.1:" + fmt.Sprintf("%v", int32(internalPort))} containers[0].Env[1].Value = fmt.Sprintf("%v", internalPort) @@ -292,12 +292,26 @@ func (nl *metricsExporter) SetMetricsExporterAsDesired(ds *appsv1.DaemonSet, dev }, Args: args, VolumeMounts: volumeMounts, + Ports: []v1.ContainerPort{ + { + Name: "exporter-port", + Protocol: v1.ProtocolTCP, + ContainerPort: port, + }, + }, }) // Provide elevated privilege only when rbac-proxy is enabled serviceaccount = kubeRbacSAName } else { containers[0].Env[1].Value = fmt.Sprintf("%v", port) + 
containers[0].Ports = []v1.ContainerPort{ + { + Name: "exporter-port", + Protocol: v1.ProtocolTCP, + ContainerPort: port, + }, + } } gracePeriod := int64(1) diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go index 959bf39f..e745f6c3 100644 --- a/internal/nodelabeller/nodelabeller.go +++ b/internal/nodelabeller/nodelabeller.go @@ -52,6 +52,8 @@ const ( defaultNodeLabellerImage = "rocm/k8s-device-plugin:labeller-latest" defaultUbiNodeLabellerImage = "rocm/k8s-node-labeller:rhubi-latest" defaultInitContainerImage = "busybox:1.36" + defaultBlacklistFileName = "blacklist-amdgpu.conf" + openShiftBlacklistFileName = "blacklist-amdgpu-by-operator.conf" ) //go:generate mockgen -source=nodelabeller.go -package=nodelabeller -destination=mock_nodelabeller.go NodeLabeller @@ -129,15 +131,19 @@ func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig }, } - var initContainerCommand []string + blackListFileName := defaultBlacklistFileName + if nl.isOpenShift { + blackListFileName = openShiftBlacklistFileName + } + var initContainerCommand []string if devConfig.Spec.Driver.Blacklist != nil && *devConfig.Spec.Driver.Blacklist { // if users want to apply the blacklist, init container will add the amdgpu to the blacklist - initContainerCommand = []string{"sh", "-c", "echo \"# added by gpu operator \nblacklist amdgpu\" > /host-etc/modprobe.d/blacklist-amdgpu.conf; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done"} + initContainerCommand = []string{"sh", "-c", fmt.Sprintf("echo \"# added by gpu operator \nblacklist amdgpu\" > /host-etc/modprobe.d/%v; while [ ! -d /host-sys/class/kfd ] || [ ! 
-d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done", blackListFileName)} } else { // if users disabled the KMM driver, or disabled the blacklist // init container will remove any hanging amdgpu blacklist entry from the list - initContainerCommand = []string{"sh", "-c", "rm -f /host-etc/modprobe.d/blacklist-amdgpu.conf; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done"} + initContainerCommand = []string{"sh", "-c", fmt.Sprintf("rm -f /host-etc/modprobe.d/%v; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done", blackListFileName)} } initContainerImage := defaultInitContainerImage @@ -169,7 +175,7 @@ func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig InitContainers: initContainers, Containers: []v1.Container{ { - Args: []string{"-c", "./k8s-node-labeller -vram -cu-count -simd-count -device-id -family -product-name -driver-version"}, + Args: []string{"-c", "./k8s-node-labeller -vram -cu-count -simd-count -device-id -family -product-name -driver-version -compute-memory-partition -compute-partitioning-supported -memory-partitioning-supported"}, Command: []string{"sh"}, Env: []v1.EnvVar{ { diff --git a/internal/utils_container/Dockerfile b/internal/utils_container/Dockerfile index 59e84fda..a40f740b 100644 --- a/internal/utils_container/Dockerfile +++ b/internal/utils_container/Dockerfile @@ -1,31 +1,9 @@ -# Base image -FROM alpine:3.20.3 +FROM registry.access.redhat.com/ubi9/ubi-minimal:9.3 -# Install build dependencies -RUN apk add --no-cache \ - bash \ - build-base \ - automake \ - autoconf \ - libtool \ - pkgconfig \ - gettext-dev \ - bison \ - wget \ - tar \ - flex \ - linux-headers - -# Set working directory -WORKDIR /tmp - -RUN wget https://github.com/util-linux/util-linux/archive/v2.40.tar.gz && tar -xzf v2.40.tar.gz - 
-# Build and install nsenter only -WORKDIR /tmp/util-linux-2.40 -RUN ./autogen.sh && \ - ./configure --disable-all-programs --enable-nsenter && \ - make nsenter && \ - cp nsenter /nsenter +# Install nsenter from util-linux package +RUN microdnf install -y util-linux && \ + cp /usr/bin/nsenter /nsenter && \ + microdnf clean all +# Set entrypoint to nsenter ENTRYPOINT ["/nsenter"]