Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e84675e
[DOC] Add note that RVS test isn't compatible with partitioned GPU yet
yansun1996 Mar 28, 2025
60959c5
Address comments
yansun1996 Mar 31, 2025
7a50f27
BootID support for Reboot during Driver Upgrade
sriram-30 Mar 28, 2025
b140f1f
Device Plugin Usage documentation from GPU Operator
sriram-30 Mar 26, 2025
1a99bba
Optimize the docs and filename for blacklist function
yansun1996 Mar 26, 2025
042ba48
Rhubi based utils container
sriram-30 Apr 2, 2025
027cb95
use ubi minimal image for smaller size
sriram-30 Apr 2, 2025
51e8a3e
Push OLM changes for certification on OperatorHub
yansun1996 Apr 2, 2025
6490f63
New doc additions to metric and test runner section (#112)
im-AbhiP Apr 4, 2025
11df42c
Revert "New doc additions to metric and test runner section (#112)" (…
im-AbhiP Apr 4, 2025
174fee6
Reboot Loop issue if control node needs to go down for driver upgrade
sriram-30 Apr 3, 2025
6433730
Add warning to describe the known GPU scheduling issue for pre-start …
yansun1996 Apr 6, 2025
1ad40e3
Address comment
yansun1996 Apr 7, 2025
1773fc9
Doc on known limitation
sriram-30 Apr 8, 2025
81090a4
Add note for blacklisting amdgpu on OpenShift cluster in full example
yansun1996 Apr 8, 2025
36c6e96
Expose ContainerPort in Metrics Exporter Pod (#534)
bhatnitish Apr 7, 2025
53d34c0
Change default cpu/memory resource limits for Controller Manager
bhatnitish Apr 8, 2025
a7570fc
Updated ReadTheDocs conf to support copy code block button
farshadghodsian Apr 9, 2025
039ce94
Evict pods consuming partition resource types
sriram-30 Apr 9, 2025
a636a9d
[DOC] Add note that updating driver image repo is not supported
yansun1996 Apr 10, 2025
50eac07
Handle auto driver upgrade on OpenShift when KMM self-delete the NMC
sriram-30 Apr 11, 2025
0efcd89
MaxParallel constraint with MaxUnavailable
sriram-30 Apr 9, 2025
a4c4cc9
Release note doc
sriram-30 Apr 11, 2025
f2acfb1
Node labeller flags for partition related labels[DO NOT MERGE]
sriram-30 Apr 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ type DriverSpec struct {
// +kubebuilder:default=true
Enable *bool `json:"enable,omitempty"`

// blacklist amdgpu drivers on the host
// blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
// Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
// Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"}
Blacklist *bool `json:"blacklist,omitempty"`

Expand All @@ -115,6 +117,7 @@ type DriverSpec struct {
// for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
// image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
// example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
// NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"}
// +optional
// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
Expand Down Expand Up @@ -595,6 +598,7 @@ type ModuleStatus struct {
LastTransitionTime string `json:"lastTransitionTime,omitempty"`
Status UpgradeState `json:"status,omitempty"`
UpgradeStartTime string `json:"upgradeStartTime,omitempty"`
BootId string `json:"bootId,omitempty"`
}

// DeviceConfigStatus defines the observed state of Module.
Expand Down
58 changes: 47 additions & 11 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,30 @@ metadata:
}
}
]
capabilities: Basic Install
createdAt: "2025-03-25T06:19:27Z"
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.2.0
createdAt: "2025-04-10T00:25:51Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest
features.operators.openshift.io/disconnected: "true"
features.operators.openshift.io/fips-compliant: "false"
features.operators.openshift.io/proxy-aware: "true"
features.operators.openshift.io/tls-profiles: "false"
features.operators.openshift.io/token-auth-aws: "false"
features.operators.openshift.io/token-auth-azure: "false"
features.operators.openshift.io/token-auth-gcp: "false"
metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0
nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest
operatorframework.io/cluster-monitoring: "true"
operatorframework.io/suggested-namespace: openshift-amd-gpu
operators.openshift.io/valid-subscription: '[]'
operators.operatorframework.io/builder: operator-sdk-v1.32.0
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
repository: https://github.com/ROCm/gpu-operator
support: Advanced Micro Devices, Inc.
name: amd-gpu-operator.v1.2.0
namespace: placeholder
spec:
Expand Down Expand Up @@ -229,7 +247,10 @@ spec:
path: driver.amdgpuInstallerRepoURL
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL
- description: blacklist amdgpu drivers on the host
- description: blacklist amdgpu drivers on the host. Node reboot is required
to apply the blacklist on the worker nodes. Not working for OpenShift cluster.
OpenShift users please use the Machine Config Operator (MCO) resource to
configure amdgpu blacklist. Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
displayName: BlacklistDrivers
path: driver.blacklist
x-descriptors:
Expand All @@ -241,13 +262,15 @@ spec:
path: driver.enable
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:enable
- description: defines image that includes drivers and firmware blobs, don't
- description: 'defines image that includes drivers and firmware blobs, don''t
include tag since it will be fully managed by operator for vanilla k8s the
default value is image-registry:5000/$MOD_NAMESPACE/amdgpu_kmod for OpenShift
the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel
version>-<driver version> example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2
and ubuntu-22.04-5.15.0-94-generic-6.1.3
and ubuntu-22.04-5.15.0-94-generic-6.1.3 NOTE: Updating the driver image
repository is not supported. Please delete the existing DeviceConfig and
create a new one with the updated image repository'
displayName: Image
path: driver.image
x-descriptors:
Expand Down Expand Up @@ -608,7 +631,7 @@ spec:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus
version: v1alpha1
description: |-
Operator responsible for deploying AMD GPU kernel drivers and device plugin
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
displayName: amd-gpu-operator
icon:
Expand Down Expand Up @@ -1112,11 +1135,24 @@ spec:
- supported: true
type: AllNamespaces
keywords:
- amd-gpu-operator
- AMD
- GPU
- AI
- Deep Learning
- Hardware
- Driver
- Monitoring
links:
- name: Amd Gpu Operator
url: https://amd-gpu-operator.domain
maturity: alpha
- name: AMD GPU Operator
url: https://github.com/ROCm/gpu-operator
maintainers:
- email: Yan.Sun3@amd.com
name: Yan Sun
- email: farshad.ghodsian@amd.com
name: Farshad Ghodsian
- email: shrey.ajmera@amd.com
name: Shrey Ajmera
maturity: stable
provider:
name: amd-gpu-operator
name: Advanced Micro Devices, Inc.
version: 1.2.0
8 changes: 7 additions & 1 deletion bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,10 @@ spec:
installer URL is https://repo.radeon.com/amdgpu-install by default
type: string
blacklist:
description: blacklist amdgpu drivers on the host
description: |-
blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
type: boolean
enable:
default: true
Expand All @@ -357,6 +360,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
Expand Down Expand Up @@ -928,6 +932,8 @@ spec:
description: ModuleStatus contains the status of driver module installed
by operator on the node
properties:
bootId:
type: string
containerImage:
type: string
kernelVersion:
Expand Down
8 changes: 7 additions & 1 deletion config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,10 @@ spec:
installer URL is https://repo.radeon.com/amdgpu-install by default
type: string
blacklist:
description: blacklist amdgpu drivers on the host
description: |-
blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
type: boolean
enable:
default: true
Expand All @@ -353,6 +356,7 @@ spec:
for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imageRegistrySecret:
Expand Down Expand Up @@ -924,6 +928,8 @@ spec:
description: ModuleStatus contains the status of driver module installed
by operator on the node
properties:
bootId:
type: string
containerImage:
type: string
kernelVersion:
Expand Down
56 changes: 46 additions & 10 deletions config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,27 @@ kind: ClusterServiceVersion
metadata:
annotations:
alm-examples: '[]'
capabilities: Basic Install
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.2.0
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest
features.operators.openshift.io/disconnected: "true"
features.operators.openshift.io/fips-compliant: "false"
features.operators.openshift.io/proxy-aware: "true"
features.operators.openshift.io/tls-profiles: "false"
features.operators.openshift.io/token-auth-aws: "false"
features.operators.openshift.io/token-auth-azure: "false"
features.operators.openshift.io/token-auth-gcp: "false"
metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0
nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest
operatorframework.io/cluster-monitoring: "true"
operatorframework.io/suggested-namespace: openshift-amd-gpu
operators.openshift.io/valid-subscription: '[]'
repository: https://github.com/ROCm/gpu-operator
support: Advanced Micro Devices, Inc.
name: amd-gpu-operator.v0.0.0
namespace: placeholder
spec:
Expand Down Expand Up @@ -200,7 +218,10 @@ spec:
path: driver.amdgpuInstallerRepoURL
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL
- description: blacklist amdgpu drivers on the host
- description: blacklist amdgpu drivers on the host. Node reboot is required
to apply the blacklist on the worker nodes. Not working for OpenShift cluster.
OpenShift users please use the Machine Config Operator (MCO) resource to
configure amdgpu blacklist. Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
displayName: BlacklistDrivers
path: driver.blacklist
x-descriptors:
Expand All @@ -212,13 +233,15 @@ spec:
path: driver.enable
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:enable
- description: defines image that includes drivers and firmware blobs, don't
- description: 'defines image that includes drivers and firmware blobs, don''t
include tag since it will be fully managed by operator for vanilla k8s the
default value is image-registry:5000/$MOD_NAMESPACE/amdgpu_kmod for OpenShift
the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
image tag will be in the format of <linux distro>-<release version>-<kernel
version>-<driver version> example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2
and ubuntu-22.04-5.15.0-94-generic-6.1.3
and ubuntu-22.04-5.15.0-94-generic-6.1.3 NOTE: Updating the driver image
repository is not supported. Please delete the existing DeviceConfig and
create a new one with the updated image repository'
displayName: Image
path: driver.image
x-descriptors:
Expand Down Expand Up @@ -579,7 +602,7 @@ spec:
- urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus
version: v1alpha1
description: |-
Operator responsible for deploying AMD GPU kernel drivers and device plugin
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
displayName: amd-gpu-operator
icon:
Expand All @@ -599,11 +622,24 @@ spec:
- supported: true
type: AllNamespaces
keywords:
- amd-gpu-operator
- AMD
- GPU
- AI
- Deep Learning
- Hardware
- Driver
- Monitoring
links:
- name: Amd Gpu Operator
url: https://amd-gpu-operator.domain
maturity: alpha
- name: AMD GPU Operator
url: https://github.com/ROCm/gpu-operator
maintainers:
- email: Yan.Sun3@amd.com
name: Yan Sun
- email: farshad.ghodsian@amd.com
name: Farshad Ghodsian
- email: shrey.ajmera@amd.com
name: Shrey Ajmera
maturity: stable
provider:
name: amd-gpu-operator
name: Advanced Micro Devices, Inc.
version: 0.0.0
16 changes: 11 additions & 5 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,32 @@
"""Configuration file for the Sphinx documentation builder."""
import os

html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "instinct.docs.amd.com")
html_context = {}
if os.environ.get("READTHEDOCS", "") == "True":
html_context["READTHEDOCS"] = True
external_projects_local_file = "projects.yaml"
external_projects_remote_repository = ""
external_projects = ["amd-gpu-operator"]
external_projects_current_project = "amd-gpu-operator"

project = "AMD Instinct Documentation"
project = "AMD GPU Operator"
version = "1.2.0"
release = version
html_title = f"AMD GPU Operator {version}"
html_title = f"{project} {version}"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."

# Required settings
html_theme = "rocm_docs_theme"
html_theme_options = {
"flavor": "instinct"
"flavor": "instinct",
"link_main_doc": True,
# Add any additional theme options here
}
extensions = ["rocm_docs"]

# Table of contents
external_toc_path = "./sphinx/_toc.yml"

exclude_patterns = ['.venv']
exclude_patterns = ['.venv']
Loading