diff --git a/test/extended/include.go b/test/extended/include.go index a5e6b7a288a2..32186244d0e9 100644 --- a/test/extended/include.go +++ b/test/extended/include.go @@ -39,6 +39,7 @@ import ( _ "github.com/openshift/origin/test/extended/machines" _ "github.com/openshift/origin/test/extended/networking" _ "github.com/openshift/origin/test/extended/node" + _ "github.com/openshift/origin/test/extended/node/dra/nvidia" _ "github.com/openshift/origin/test/extended/node/node_e2e" _ "github.com/openshift/origin/test/extended/node_tuning" _ "github.com/openshift/origin/test/extended/oauth" diff --git a/test/extended/node/dra/OWNERS b/test/extended/node/dra/OWNERS new file mode 100644 index 000000000000..07217dd1abb3 --- /dev/null +++ b/test/extended/node/dra/OWNERS @@ -0,0 +1,17 @@ +approvers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +reviewers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +labels: + - sig/scheduling + - area/dra diff --git a/test/extended/node/dra/nvidia/OWNERS b/test/extended/node/dra/nvidia/OWNERS new file mode 100644 index 000000000000..07217dd1abb3 --- /dev/null +++ b/test/extended/node/dra/nvidia/OWNERS @@ -0,0 +1,17 @@ +approvers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +reviewers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +labels: + - sig/scheduling + - area/dra diff --git a/test/extended/node/dra/nvidia/README.md b/test/extended/node/dra/nvidia/README.md new file mode 100644 index 000000000000..40589d575313 --- /dev/null +++ b/test/extended/node/dra/nvidia/README.md @@ -0,0 +1,1083 @@ +# NVIDIA DRA Extended Tests for OpenShift + +This directory contains extended tests for NVIDIA Dynamic Resource Allocation (DRA) functionality on OpenShift clusters with GPU nodes. + +## Overview + +These tests validate: +- NVIDIA DRA driver installation and lifecycle +- Single GPU allocation via ResourceClaims +- Multi-GPU workload allocation +- Pod lifecycle and resource cleanup +- GPU device accessibility in pods + +## Enabling DRA on OpenShift - Quick Reference + +This section provides a concise guide for enabling Dynamic Resource Allocation (DRA) for NVIDIA GPUs on OpenShift. This information is useful for documentation teams and administrators. + +### Prerequisites + +1. **OpenShift 4.21+** with Kubernetes 1.34.2+ (DRA GA support) +2. **NVIDIA GPU Operator** installed via OLM (Operator Lifecycle Manager) + - Install from OperatorHub in OpenShift Console + - **Critical**: Enable CDI (Container Device Interface) in ClusterPolicy +3. **GPU-enabled worker nodes** (e.g., AWS g4dn.xlarge with Tesla T4) + +### Installation Steps Summary + +1. **Install GPU Operator** (via OpenShift OperatorHub) +2. **Enable CDI** in GPU Operator ClusterPolicy: + ```bash + oc patch clusterpolicy gpu-cluster-policy --type=merge -p ' + spec: + cdi: + enabled: true + ' + ``` +3. **Label GPU nodes** for DRA: + ```bash + oc label nodes -l nvidia.com/gpu.present=true nvidia.com/dra-kubelet-plugin=true + ``` +4. **Install DRA Driver** with minimal configuration: + ```bash + helm install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --namespace nvidia-dra-driver-gpu --create-namespace \ + --set nvidiaDriverRoot=/run/nvidia/driver \ + --set gpuResourcesEnabledOverride=true \ + --set image.pullPolicy=IfNotPresent \ + --set-string kubeletPlugin.nodeSelector.nvidia\.com/dra-kubelet-plugin=true \ + --set controller.tolerations[0].key=node-role.kubernetes.io/master \ + --set controller.tolerations[0].operator=Exists \ + --set controller.tolerations[0].effect=NoSchedule \ + --set controller.tolerations[1].key=node-role.kubernetes.io/control-plane \ + --set controller.tolerations[1].operator=Exists \ + --set controller.tolerations[1].effect=NoSchedule + ``` + +### Key Configuration Parameters + +| Parameter | Value | Why It's Required | +|-----------|-------|-------------------| +| `nvidiaDriverRoot` | `/run/nvidia/driver` | GPU Operator installs drivers here on OpenShift (not `/`) | +| `gpuResourcesEnabledOverride` | `true` | Enables DRA-based GPU allocation (vs. traditional device plugin) | +| `kubeletPlugin.nodeSelector` | `nvidia.com/dra-kubelet-plugin=true` | Ensures DRA components only run on labeled GPU nodes | +| `image.pullPolicy` | `IfNotPresent` | Improves performance by caching images | + +### Using DRA in Workloads + +Once DRA is enabled, use `ResourceClaim` and `DeviceClass` resources instead of traditional `nvidia.com/gpu` resource requests: + +```yaml +# DeviceClass (cluster-scoped) +apiVersion: resource.k8s.io/v1 +kind: DeviceClass +metadata: + name: nvidia-gpu +spec: + selectors: + - cel: + expression: device.driver == "gpu.nvidia.com" +--- +# ResourceClaim (namespace-scoped) +apiVersion: resource.k8s.io/v1 +kind: ResourceClaim +metadata: + name: my-gpu-claim +spec: + devices: + requests: + - name: gpu + exactly: + deviceClassName: nvidia-gpu + count: 1 +--- +# Pod using the claim +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: cuda-app + image: nvcr.io/nvidia/cuda:12.0.0-base-ubuntu22.04 + command: ["nvidia-smi"] + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimName: my-gpu-claim +``` + +### Troubleshooting Common Issues + +| Issue | Cause | Solution | +|-------|-------|----------| +| DRA driver pods stuck at `Init:0/1` | Wrong `nvidiaDriverRoot` | Set to `/run/nvidia/driver` (not `/`) | +| CDI device not injected | CDI disabled in GPU Operator | Enable `cdi.enabled=true` in ClusterPolicy | +| Kubelet plugin not scheduled | Nodes not labeled | Label GPU nodes with `nvidia.com/dra-kubelet-plugin=true` | + +For complete documentation, see sections below. + +## ⚠️ Important: Version Matching Requirement + +**CRITICAL**: The `openshift-tests` binary version MUST match the cluster's release image version. This is a design requirement of the OpenShift test framework. + +### Why Version Matching is Required + +The OpenShift test framework has a two-layer architecture: +1. **Local binary**: Your built `openshift-tests` binary +2. **Cluster release image**: The version of OpenShift running on your cluster + +When tests run, the framework attempts to extract component-specific test binaries from the cluster's release image. If versions don't match, you'll see errors like: + +``` +error: couldn't retrieve test suites: failed to extract test binaries +note the version of origin needs to match the version of the cluster under test +``` + +### How to Match Versions + +#### Step 1: Find Your Cluster's Release Commit + +```bash +# Set your kubeconfig +export KUBECONFIG=/path/to/your/kubeconfig + +# Get the cluster version +oc get clusterversion version -o jsonpath='{.status.desired.version}' +# Example output: 4.21.0 + +# Get the exact origin commit used for this release +oc adm release info $(oc get clusterversion version -o jsonpath='{.status.desired.image}') \ + --commits | grep "^origin" + +# Example output: +# origin https://github.com/openshift/origin 1d23a96bb921ad1ceffaaed8bf295d26626f87d5 +``` + +#### Step 2: Checkout the Matching Commit + +```bash +cd /path/to/origin + +# Checkout the cluster's commit (use the commit from Step 1) +git checkout 1d23a96bb921ad1ceffaaed8bf295d26626f87d5 + +# Create a working branch for your NVIDIA DRA tests +git checkout -b nvidia-dra-ocp-4.21.0 + +# Now add your NVIDIA DRA test code to this branch +# (cherry-pick commits, copy files, or apply patches as needed) +``` + +#### Step 3: Verify Version Match + +After building, verify the versions match: + +```bash +# Build the test binary +make WHAT=cmd/openshift-tests + +# Check binary version +./openshift-tests version 2>&1 | grep "openshift-tests" +# Example: openshift-tests v4.1.0-10527-g1d23a96 + +# The commit hash (g1d23a96) should match your cluster's commit +``` + +### Alternative: Using run-test Command + +The `run-test` command bypasses the release image extraction and runs tests directly from your local binary: + +```bash +# This works even with version mismatch +./openshift-tests run-test -n '[sig-scheduling] NVIDIA DRA Basic GPU Allocation should allocate single GPU to pod via DRA [Suite:openshift/conformance/parallel]' +``` + +## Prerequisites + +### Required Cluster Setup (Before Running Tests) + +**CRITICAL REQUIREMENTS:** +1. ✅ NVIDIA GPU Operator must be pre-installed (tests will FAIL if not present) +2. ✅ GPU Operator ClusterPolicy must have `cdi.enabled=true` (REQUIRED for DRA) +3. ✅ Cluster must have GPU-enabled worker nodes + +### GPU Operator Installation + +**Install the NVIDIA GPU Operator following the official documentation:** + +📖 **[NVIDIA GPU Operator on OpenShift Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html)** + +The official guide covers: +- Installing GPU Operator via OLM (Red Hat OperatorHub) +- Creating and configuring the ClusterPolicy +- Verifying the installation +- Troubleshooting common issues + +### DRA-Specific Configuration Requirements + +After installing the GPU Operator, ensure the ClusterPolicy has the following settings for DRA to work: + +**CRITICAL**: Add or verify these settings in your ClusterPolicy: + +```bash +oc patch clusterpolicy gpu-cluster-policy --type=merge -p ' +spec: + operator: + defaultRuntime: crio + cdi: + enabled: true + default: false +' +``` + +**Required Settings Explained:** +- `operator.defaultRuntime: crio` - Required for OpenShift (uses CRI-O, not containerd) +- `cdi.enabled: true` - **CRITICAL** - Enables Container Device Interface required for DRA +- `cdi.default: false` - Don't make CDI the default device injection method + +### Verification + +After installing GPU Operator and configuring the ClusterPolicy: + +```bash +# 1. Verify ClusterPolicy is ready +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}' +# Expected: "ready" + +# 2. Verify CDI is enabled (CRITICAL for DRA) +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.spec.cdi.enabled}' +# Expected: "true" + +# 3. Verify runtime is set to crio +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.spec.operator.defaultRuntime}' +# Expected: "crio" + +# 4. Check GPU Operator pods are running +oc get pods -n nvidia-gpu-operator +# All pods should be in Running state + +# 5. Check GPU nodes are labeled +oc get nodes -l nvidia.com/gpu.present=true +# Should list at least one GPU node +``` + +**If CDI is not enabled**, patch the ClusterPolicy: + +```bash +oc patch clusterpolicy gpu-cluster-policy --type=merge -p ' +spec: + cdi: + enabled: true + default: false +' + +# Wait for container toolkit to restart +oc rollout status daemonset/nvidia-container-toolkit-daemonset -n nvidia-gpu-operator +``` + +### DRA Driver Versioning + +**Important:** Tests install the **latest version** of the NVIDIA DRA Driver from the Helm chart repository. This ensures: +- Early detection of compatibility issues with new DRA driver releases +- Testing against current upstream development +- Validation that latest driver works with cluster's GPU Operator version + +If you need a specific DRA driver version, install it manually before running tests, and the test framework will detect and use the existing installation. + +### Automatically Installed by Tests + +The tests will **automatically install** the following if not already present: +- GPU node labeling with `nvidia.com/dra-kubelet-plugin=true` +- NVIDIA DRA Driver (**latest version** from Helm chart) +- All required SCC permissions for DRA driver +- Helm repository configuration + +The test framework detects existing DRA driver installations and skips if already running. + +### Required Before Running Tests + +1. **OpenShift cluster** with GPU-enabled worker nodes (OCP 4.21+) + - DRA support requires OpenShift 4.21 or later + - Tested on OCP 4.21.0 with Kubernetes 1.34.2 +2. **Helm 3** installed and available in PATH +3. **GPU hardware** present on worker nodes + - Tested with NVIDIA Tesla T4 (g4dn.xlarge on AWS) +4. **Cluster-admin** access for test execution +5. **Matching origin checkout** (see Version Matching section above) + +## Test Structure + +``` +test/extended/node/dra/nvidia/ +├── nvidia_dra.go # Main test suite (Ginkgo) - extended test format +├── prerequisites_installer.go # Automated prerequisite installation +├── driver_installer.go # Legacy DRA driver helpers (compatibility) +├── gpu_validator.go # GPU validation utilities +├── resource_builder.go # DRA resource builders (DeviceClass, ResourceClaim, Pod) +├── fixtures/ # YAML test fixtures +│ ├── deviceclass-nvidia.yaml +│ ├── resourceclaim-single-gpu.yaml +│ ├── resourceclaim-multi-gpu.yaml +│ ├── pod-single-gpu.yaml +│ └── pod-multi-gpu.yaml +├── standalone_test.sh # Standalone validation script +├── cleanup.sh # Cleanup utility +└── README.md # This file +``` + +## TL;DR - Quick Command Reference + +For users who already have their cluster set up with GPU Operator and want to run tests immediately: + +```bash +# Build test binary (ensure you've matched your origin checkout to cluster version) +cd /path/to/origin +make WHAT=cmd/openshift-tests + +# Set kubeconfig +export KUBECONFIG=/path/to/kubeconfig + +# Run ALL NVIDIA DRA tests in one command +./openshift-tests run --dry-run all 2>&1 | \ + grep "NVIDIA DRA" | \ + ./openshift-tests run -f - + +# Alternative: Run specific test +./openshift-tests run-test \ + -n '[sig-scheduling] NVIDIA DRA Basic GPU Allocation should allocate single GPU to pod via DRA [Suite:openshift/conformance/parallel]' +``` + +**Prerequisites**: GPU Operator must be installed with CDI enabled. See full documentation below for details. + +--- + +## Quick Start - Running Tests via openshift-tests + +### Option 1: Fully Automated (Recommended) + +```bash +# 1. Match your origin checkout to cluster version (see Version Matching section above) +cd /path/to/origin +git checkout +git checkout -b nvidia-dra-ocp- + +# 2. Ensure NVIDIA DRA test code is present in test/extended/node/dra/nvidia/ + +# 3. Build test binary +make WHAT=cmd/openshift-tests + +# 4. Set kubeconfig +export KUBECONFIG=/path/to/kubeconfig + +# 5. Run all NVIDIA DRA tests (single command) +./openshift-tests run --dry-run all 2>&1 | \ + grep "NVIDIA DRA" | \ + ./openshift-tests run -f - + +# OR run tests individually: +./openshift-tests run-test \ + -n '[sig-scheduling] NVIDIA DRA Basic GPU Allocation should allocate single GPU to pod via DRA [Suite:openshift/conformance/parallel]' + +./openshift-tests run-test \ + -n '[sig-scheduling] NVIDIA DRA Basic GPU Allocation should handle pod deletion and resource cleanup [Suite:openshift/conformance/parallel]' + +./openshift-tests run-test \ + -n '[sig-scheduling] NVIDIA DRA Multi-GPU Workloads should allocate multiple GPUs to single pod [Suite:openshift/conformance/parallel]' +``` + +**What happens automatically:** +1. Tests verify GPU Operator is already installed (FAILS if not present) +2. Tests wait for GPU Operator to be ready +3. Tests check if DRA Driver is already installed +4. If DRA Driver not found: + - GPU nodes are labeled with `nvidia.com/dra-kubelet-plugin=true` + - Helm repository is added (`nvidia` repo) + - DRA Driver (latest version) is installed via Helm with minimal configuration: + - `nvidiaDriverRoot=/run/nvidia/driver` - Points to GPU Operator driver location + - `gpuResourcesEnabledOverride=true` - Enables GPU allocation via DRA + - `image.pullPolicy=IfNotPresent` - Caches images for faster startup + - `kubeletPlugin.nodeSelector` - Targets labeled GPU nodes only + - `controller.tolerations` - Allows controller to schedule on tainted control-plane nodes + - SCC permissions are granted to DRA service accounts +5. Tests wait for DRA Driver to be ready (controller + kubelet plugin) +6. Tests execute against the configured GPU stack + +**Important:** GPU Operator MUST be pre-installed. See Prerequisites section above. + +**Re-running tests:** DRA Driver installation is automatically skipped if already installed (detection works with both Helm and manual installations). + +### Option 2: Run with Regex Filter + +Run all NVIDIA DRA tests that match a pattern: + +```bash +# Run all tests containing "NVIDIA DRA" using regex +./openshift-tests run all --run-until-failure -o /tmp/nvidia-dra-results \ + --include-success --junit-dir /tmp/nvidia-dra-junit 2>&1 | \ + grep -E '\[sig-scheduling\] NVIDIA DRA' +``` + +**Note**: The command above runs all matching tests but filters output. For cleaner execution, use the method in Option 1. + +### Option 3: List Available Tests + +```bash +# List all NVIDIA DRA tests without running them +./openshift-tests run --dry-run all 2>&1 | grep "NVIDIA DRA" + +# Example output: +# "[sig-scheduling] NVIDIA DRA Basic GPU Allocation should allocate single GPU to pod via DRA [Suite:openshift/conformance/parallel]" +# "[sig-scheduling] NVIDIA DRA Basic GPU Allocation should handle pod deletion and resource cleanup [Suite:openshift/conformance/parallel]" +# "[sig-scheduling] NVIDIA DRA Multi-GPU Workloads should allocate multiple GPUs to single pod [Suite:openshift/conformance/parallel]" + +# Count total NVIDIA DRA tests +./openshift-tests run --dry-run all 2>&1 | grep -c "NVIDIA DRA" +``` + +### Option 4: Run Standalone Validation (No Framework) + +For quick manual validation without the test framework: + +```bash +cd test/extended/node/dra/nvidia +export KUBECONFIG=/path/to/kubeconfig +./standalone_test.sh +``` + +**Features**: The standalone script now includes: +- **Automated DRA Driver installation** (via Helm if not already present) +- Detection of existing installations (via running pods, not just Helm releases) +- GPU Operator validation (fails if not present) +- Complete end-to-end validation (10 test scenarios) +- Detailed test result reporting +- Automatic cleanup on exit + +**Note**: Requires Helm 3 for automated installation. If Helm is not available, prerequisites must be pre-installed manually. + +## Standalone Test Suite + +The `standalone_test.sh` script provides a complete validation suite that mirrors the functionality of the openshift-tests framework tests, but can run independently without requiring the test framework build. + +### Features + +- **Automated Installation**: Automatically installs GPU Operator and DRA Driver via Helm if not present +- **Smart Detection**: Detects existing installations by checking for running pods (not just Helm releases) +- **Complete Validation**: Runs 10 comprehensive test scenarios +- **Detailed Reporting**: Color-coded output with pass/fail tracking +- **Automatic Cleanup**: Cleans up test resources on exit (via trap) + +### Test Coverage + +The standalone script runs the following tests: + +1. **Prerequisites Check** - Verifies Helm, GPU Operator, DRA Driver, GPU nodes, and ResourceSlices +2. **Namespace Creation** - Creates test namespace with privileged pod security level +3. **DeviceClass Creation** - Creates DeviceClass with CEL selector for `gpu.nvidia.com` +4. **ResourceClaim Creation** - Creates ResourceClaim using v1 API with `exactly` field +5. **Pod Creation** - Creates pod with ResourceClaim reference +6. **Pod Scheduling** - Waits for pod to reach Running/Succeeded state (2 minute timeout) +7. **GPU Access Validation** - Verifies nvidia-smi output shows accessible GPU +8. **ResourceClaim Allocation** - Validates ResourceClaim allocation status +9. **Lifecycle Testing** - Tests pod deletion and ResourceClaim persistence +10. **Multi-GPU Detection** - Checks if cluster has 2+ GPUs for multi-GPU testing + +### Running the Standalone Tests + +```bash +cd test/extended/node/dra/nvidia +export KUBECONFIG=/path/to/kubeconfig + +# Run with default results directory (/tmp/nvidia-dra-test-results) +./standalone_test.sh + +# Run with custom results directory +RESULTS_DIR=/my/results/path ./standalone_test.sh +``` + +### Example Output + +``` +====================================== +NVIDIA DRA Standalone Test Suite +====================================== +Results will be saved to: /tmp/nvidia-dra-test-results + +[INFO] Test 1: Check prerequisites (GPU Operator, DRA Driver, Helm) +[INFO] ✓ PASSED: Prerequisites verified (GPU Node: ip-10-0-10-28, ResourceSlices: 2) + +[INFO] Test 2: Create test namespace: nvidia-dra-e2e-test +[INFO] ✓ PASSED: Test namespace created with privileged security level + +[INFO] Test 3: Create DeviceClass: nvidia-gpu-test-1738672800 +[INFO] ✓ PASSED: DeviceClass created + +... + +====================================== +Test Results Summary +====================================== +Tests Run: 10 +Tests Passed: 9 +Tests Failed: 0 + +Result: ALL TESTS PASSED ✓ +``` + +### Prerequisites + +The standalone script requires: +- **Helm 3** - For automated installation (if prerequisites not already present) +- **Cluster-admin access** - For SCC permissions and namespace creation +- **GPU-enabled cluster** - OpenShift cluster with GPU worker nodes +- **Internet access** - To pull Helm charts and container images (if installing prerequisites) + +If Helm is not available, prerequisites must be pre-installed manually (see Manual Installation Reference section). + +## Test Scenarios + +### 1. Single GPU Allocation ✅ +- Creates DeviceClass with CEL selector +- Creates ResourceClaim requesting exactly 1 GPU +- Schedules pod with ResourceClaim +- Validates GPU accessibility via nvidia-smi +- Validates CDI device injection + +**Expected Result**: PASSED + +### 2. Resource Cleanup ✅ +- Creates pod with GPU ResourceClaim +- Deletes pod +- Verifies ResourceClaim persists after pod deletion +- Validates resource lifecycle management + +**Expected Result**: PASSED + +### 3. Multi-GPU Workloads ⚠️ +- Creates ResourceClaim requesting exactly 2 GPUs +- Schedules pod requiring multiple GPUs +- Validates all GPUs are accessible + +**Expected Result**: SKIPPED if cluster has fewer than 2 GPUs on a single node (expected behavior) + +## Manual Installation Reference + +The following steps document what the automated test code does. Use this as a reference for: +- Understanding the automated installation process +- Manually pre-installing prerequisites (optional) +- Debugging installation issues +- CI job configuration + +### Prerequisites for Manual Installation + +```bash +# Verify Helm 3 is installed +helm version + +# If not installed, install Helm 3 +curl -fsSL https://get.helm.sh/helm-v3.20.0-linux-amd64.tar.gz -o /tmp/helm.tar.gz +tar -zxvf /tmp/helm.tar.gz -C /tmp +sudo mv /tmp/linux-amd64/helm /usr/local/bin/helm +rm -rf /tmp/helm.tar.gz /tmp/linux-amd64 +``` + +### Step 1: Add NVIDIA Helm Repository + +```bash +# Add NVIDIA Helm repository +helm repo add nvidia https://nvidia.github.io/gpu-operator +helm repo update + +# Verify repository +helm search repo nvidia/gpu-operator --versions | head -5 +``` + +### Step 2: Verify GPU Operator Installation + +The GPU Operator should already be installed on the cluster. Verify it's running: + +```bash +# Check namespace exists +oc get namespace nvidia-gpu-operator + +# Check GPU Operator pods +oc get pods -n nvidia-gpu-operator + +# Expected output should include: +# - gpu-operator-xxxxx (Running) +# - nvidia-driver-daemonset-xxxxx (Running, 2/2) +# - nvidia-device-plugin-daemonset-xxxxx (Running) +# - nvidia-dcgm-exporter-xxxxx (Running) +# - gpu-feature-discovery-xxxxx (Running) + +# Verify GPU nodes are labeled by NFD +oc get nodes -l nvidia.com/gpu.present=true + +# Expected: At least one GPU node listed +``` + +If GPU Operator is not installed, install it following the Prerequisites section above. + +### Step 3: Verify GPU Node Labeling + +```bash +# NFD automatically labels GPU nodes - verify labels +oc get nodes -l nvidia.com/gpu.present=true + +# Expected output should show your GPU node(s): +# NAME STATUS ROLES AGE VERSION +# ip-10-0-10-28.ap-south-1.compute.internal Ready worker 1h v1.34.2 + +# Check GPU node labels in detail +oc describe node | grep nvidia.com + +# Expected labels (set by NFD): +# nvidia.com/gpu.present=true +# nvidia.com/gpu.product=Tesla-T4 +# nvidia.com/gpu.memory=15360 +# nvidia.com/cuda.driver.major=580 +# nvidia.com/cuda.driver.minor=105 +# nvidia.com/cuda.driver.rev=08 +``` + +### Step 4: Verify nvidia-smi Access + +```bash +# Get GPU node name +export GPU_NODE=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}') +echo "GPU Node: ${GPU_NODE}" + +# Test nvidia-smi on the node +oc debug node/${GPU_NODE} -- chroot /host /run/nvidia/driver/usr/bin/nvidia-smi + +# Expected output: +# +-----------------------------------------------------------------------------------------+ +# | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 | +# +-----------------------------------------+------------------------+----------------------+ +# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +# ... +# | 0 Tesla T4 On | 00000000:00:1E.0 Off | 0 | +# ... +``` + +### Step 5: Label GPU Nodes for DRA + +Before installing the DRA driver, label all GPU nodes to indicate they should run the DRA kubelet plugin: + +```bash +# Label all GPU nodes for DRA kubelet plugin scheduling +for node in $(oc get nodes -l nvidia.com/gpu.present=true -o name); do + oc label $node nvidia.com/dra-kubelet-plugin=true --overwrite +done + +# Verify the label was applied +oc get nodes -l nvidia.com/dra-kubelet-plugin=true + +# Expected output: All GPU nodes should be listed +``` + +**Why is this label required?** + +The `nvidia.com/dra-kubelet-plugin=true` label serves two purposes: + +1. **Node Selection**: Ensures the DRA kubelet plugin DaemonSet only runs on GPU-enabled nodes +2. **Driver Manager Compatibility**: Works around a known issue where the NVIDIA Driver Manager doesn't properly evict DRA kubelet plugin pods during driver updates + +This label is recommended by NVIDIA's official documentation and is used in the kubelet plugin's node selector configuration. + +### Step 6: Install NVIDIA DRA Driver + +```bash +# Create namespace for DRA driver +oc create namespace nvidia-dra-driver-gpu + +# Grant SCC permissions (REQUIRED before Helm install on OpenShift) +oc adm policy add-scc-to-user privileged \ + -z nvidia-dra-driver-gpu-service-account-controller \ + -n nvidia-dra-driver-gpu + +oc adm policy add-scc-to-user privileged \ + -z nvidia-dra-driver-gpu-service-account-kubeletplugin \ + -n nvidia-dra-driver-gpu + +oc adm policy add-scc-to-user privileged \ + -z compute-domain-daemon-service-account \ + -n nvidia-dra-driver-gpu + +# Install NVIDIA DRA driver via Helm with minimal configuration +helm install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --namespace nvidia-dra-driver-gpu \ + --set nvidiaDriverRoot=/run/nvidia/driver \ + --set gpuResourcesEnabledOverride=true \ + --set image.pullPolicy=IfNotPresent \ + --set-string kubeletPlugin.nodeSelector.nvidia\.com/dra-kubelet-plugin=true \ + --set controller.tolerations[0].key=node-role.kubernetes.io/master \ + --set controller.tolerations[0].operator=Exists \ + --set controller.tolerations[0].effect=NoSchedule \ + --set controller.tolerations[1].key=node-role.kubernetes.io/control-plane \ + --set controller.tolerations[1].operator=Exists \ + --set controller.tolerations[1].effect=NoSchedule \ + --wait \ + --timeout 5m +``` + +### DRA Driver Helm Parameters Explained + +The following table describes each Helm parameter used in the installation: + +| Parameter | Value | Required | Purpose | +|-----------|-------|----------|---------| +| `nvidiaDriverRoot` | `/run/nvidia/driver` | **Yes** | Specifies where the NVIDIA GPU Operator installs GPU drivers on OpenShift nodes. The DRA driver needs this path to access the NVIDIA driver binaries and libraries.

**Critical**: Must be `/run/nvidia/driver` for GPU Operator installations. Using `/` (the default) will cause the kubelet plugin to fail with `Init:0/1` errors. | +| `gpuResourcesEnabledOverride` | `true` | **Yes** | Enables GPU allocation support in the DRA driver. This tells the driver to publish GPU resources to Kubernetes and handle GPU allocation requests via DRA.

Without this, only ComputeDomain resources would be available (for multi-node GPU configurations). | +| `image.pullPolicy` | `IfNotPresent` | Recommended | Caches container images locally after first pull. Improves pod startup time on subsequent deployments.

Recommended by NVIDIA documentation for production deployments. | +| `kubeletPlugin.nodeSelector.nvidia.com/dra-kubelet-plugin` | `true` | Recommended | Restricts the DRA kubelet plugin DaemonSet to only run on nodes labeled with `nvidia.com/dra-kubelet-plugin=true`.

**Benefits**:
- Prevents kubelet plugin from attempting to run on non-GPU nodes
- Works around NVIDIA Driver Manager pod eviction issues
- Follows NVIDIA's recommended deployment pattern

**Important**: Use `--set-string` (not `--set`) to ensure the value `true` is treated as a string. Kubernetes `nodeSelector` requires string values, and using `--set` will cause Helm to interpret `true` as a boolean, resulting in installation errors. | +| `controller.tolerations[*]` | Specific tolerations | **Required** | Allows the controller pod to schedule on control-plane/master nodes that have `NoSchedule` taints.

**Why needed**: The controller (a Deployment) has a node affinity requiring it to run on control-plane nodes. These nodes are typically tainted with `node-role.kubernetes.io/master:NoSchedule` or `node-role.kubernetes.io/control-plane:NoSchedule` to prevent regular workloads from scheduling there. Without these tolerations, the controller pod will remain in `Pending` state.

**Tolerations set**:
- `[0]`: Tolerates `node-role.kubernetes.io/master:NoSchedule`
- `[1]`: Tolerates `node-role.kubernetes.io/control-plane:NoSchedule`

These cover both legacy (`master`) and current (`control-plane`) node role naming conventions. | + +### Optional Feature Gates (Not Set by Default) + +The DRA driver supports several feature gates that can be enabled for specific use cases. **These are intentionally NOT set in the basic installation** to keep configuration minimal: + +| Feature Gate | Default | When to Enable | +|-------------|---------|----------------| +| `featureGates.MPSSupport` | Platform default | Enable when testing NVIDIA Multi-Process Service (MPS) for GPU sharing | +| `featureGates.TimeSlicingSettings` | Platform default | Enable when testing GPU time-slicing for workload scheduling | +| `featureGates.ComputeDomainCliques` | Platform default | Enable for multi-node GPU configurations with NVLink (GB200/GB300 systems) | +| `featureGates.IMEXDaemonsWithDNSNames` | Platform default | Required if `ComputeDomainCliques` is enabled | + +**Best Practice**: Only enable feature gates when you need to test or use those specific features. This keeps the configuration simple and avoids potential conflicts. + +### Common Configuration Mistakes + +❌ **Wrong**: Using default driver root +```bash +--set nvidiaDriverRoot=/ +``` +**Result**: Kubelet plugin pods stuck at `Init:0/1` because they can't find GPU drivers + +✅ **Correct**: Specify GPU Operator driver location +```bash +--set nvidiaDriverRoot=/run/nvidia/driver +``` + +--- + +❌ **Wrong**: Not labeling GPU nodes +```bash +# Skipping node labeling step +helm install nvidia-dra-driver-gpu ... +``` +**Result**: Kubelet plugin may attempt to run on non-GPU nodes, or driver manager issues occur + +✅ **Correct**: Label nodes before installation +```bash +oc label node nvidia.com/dra-kubelet-plugin=true +helm install nvidia-dra-driver-gpu ... \ + --set-string kubeletPlugin.nodeSelector.nvidia\.com/dra-kubelet-plugin=true +``` + +--- + +❌ **Wrong**: Enabling feature gates unnecessarily +```bash +--set featureGates.MPSSupport=true \ +--set featureGates.TimeSlicingSettings=true \ +--set featureGates.ComputeDomainCliques=false +``` +**Result**: Adds complexity without benefit for basic GPU allocation testing + +✅ **Correct**: Minimal configuration for basic DRA +```bash +--set nvidiaDriverRoot=/run/nvidia/driver \ +--set gpuResourcesEnabledOverride=true +``` +Enable feature gates only when testing specific features + +### Step 7: Verify DRA Driver Installation + +```bash +# Check DRA driver pods +oc get pods -n nvidia-dra-driver-gpu + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# nvidia-dra-driver-gpu-controller-xxxxx 1/1 Running 0 2m +# nvidia-dra-driver-gpu-kubelet-plugin-xxxxx 2/2 Running 0 2m ← MUST be 2/2 + +# Wait for kubelet plugin to be ready +oc wait --for=condition=Ready pod \ + -l app.kubernetes.io/name=nvidia-dra-driver-gpu \ + -n nvidia-dra-driver-gpu --timeout=300s + +# Verify ResourceSlices are published +oc get resourceslices + +# Expected output (at least 2 slices per GPU node): +# NAME DRIVER POOL AGE +# ip-10-0-10-28-compute-domain.nvidia.com-xxxxx compute-domain.nvidia.com 2m +# ip-10-0-10-28-gpu.nvidia.com-xxxxx gpu.nvidia.com 2m + +# Inspect ResourceSlice details +oc get resourceslice -o json | \ + jq -r '.items[] | select(.spec.driver=="gpu.nvidia.com") | .spec.devices[0]' + +# Expected output shows GPU details: +# { +# "name": "gpu-0", +# "attributes": { +# "dra.nvidia.com/architecture": "Turing", +# "dra.nvidia.com/brand": "Tesla", +# "dra.nvidia.com/cuda-compute-capability": "7.5", +# "dra.nvidia.com/index": "0", +# "dra.nvidia.com/memory": "15360", +# "dra.nvidia.com/model": "Tesla-T4", +# "dra.nvidia.com/product": "Tesla-T4-SHARED" +# } +# } +``` + +### Step 8: Complete Verification Checklist + +```bash +# 1. GPU Operator is running +oc get pods -n nvidia-gpu-operator | grep -v Completed +# All pods should be Running, nvidia-driver-daemonset MUST be 2/2 + +# 2. DRA Driver is running +oc get pods -n nvidia-dra-driver-gpu +# Expected: +# - nvidia-dra-driver-gpu-controller-* : 1/1 Running +# - nvidia-dra-driver-gpu-kubelet-plugin-* : 2/2 Running + +# 3. ResourceSlices published +oc get resourceslices | wc -l +# Should be > 0 (typically 2 per GPU node) + +# 4. GPU nodes labeled by NFD +oc get nodes -l nvidia.com/gpu.present=true -o name +# Should list your GPU nodes + +# 5. nvidia-smi accessible +GPU_NODE=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}') +oc debug node/${GPU_NODE} -- chroot /host /run/nvidia/driver/usr/bin/nvidia-smi +# Should show GPU information + +# ✅ If all checks pass, your cluster is ready for NVIDIA DRA tests! +``` + +## Important Notes + +### DRA Driver Configuration + +The NVIDIA DRA Driver (installed automatically by tests) requires the correct `nvidiaDriverRoot` setting: + +```bash +# ✅ CORRECT - Points to where GPU Operator installs drivers +--set nvidiaDriverRoot=/run/nvidia/driver +``` + +This is automatically configured by the test framework. If you're manually installing the DRA driver, ensure this setting is correct. + +### GPU Operator CDI Requirement + +**CRITICAL**: The GPU Operator ClusterPolicy must have CDI enabled for DRA to work. + +Verify CDI is enabled: +```bash +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.spec.cdi.enabled}' +# Expected: "true" +``` + +If not enabled, see the Prerequisites section above for instructions to patch the ClusterPolicy. + +## Cleanup + +### Option 1: Automated Cleanup (Recommended) + +Use the enhanced cleanup script that mirrors the test code's UninstallAll logic: + +```bash +cd test/extended/node/dra/nvidia +./cleanup.sh +``` + +**What it does:** +1. Uninstalls NVIDIA DRA Driver via Helm (with proper wait/timeout) +2. Removes SCC permissions (ClusterRoleBindings for service accounts) +3. Deletes `nvidia-dra-driver-gpu` namespace +4. Cleans up test resources (DeviceClasses, test namespaces) +5. Provides colored output for better visibility + +**Note:** GPU Operator is cluster infrastructure and is NOT removed by cleanup. + +**Features:** +- Matches the UninstallAll logic from prerequisites_installer.go +- Safe error handling (continues even if resources not found) +- Cleans up DRA driver Helm release and namespace +- Removes test artifacts (DeviceClasses, ResourceClaims in test namespaces) + +### Option 2: Manual Cleanup + +```bash +# Uninstall DRA Driver +helm uninstall nvidia-dra-driver-gpu -n nvidia-dra-driver-gpu --wait --timeout 5m +oc delete namespace nvidia-dra-driver-gpu + +# Remove SCC permissions +oc delete clusterrolebinding \ + nvidia-dra-privileged-nvidia-dra-driver-gpu-service-account-controller \ + nvidia-dra-privileged-nvidia-dra-driver-gpu-service-account-kubeletplugin \ + nvidia-dra-privileged-compute-domain-daemon-service-account +``` + +**Note:** GPU Operator is cluster infrastructure and is NOT removed by cleanup. ResourceSlices are cluster-scoped and will be cleaned up automatically when DRA driver is uninstalled. + +## CI Integration + +### Recommended CI Job Configuration + +```bash +#!/bin/bash +set -euo pipefail + +# 1. Set kubeconfig +export KUBECONFIG=/path/to/kubeconfig + +# 2. Match origin version to cluster (CRITICAL) +CLUSTER_COMMIT=$(oc adm release info $(oc get clusterversion version -o jsonpath='{.status.desired.image}') \ + --commits | grep "^origin" | awk '{print $NF}') +echo "Cluster origin commit: ${CLUSTER_COMMIT}" + +# Checkout matching commit and apply NVIDIA DRA tests +cd /path/to/origin +git checkout ${CLUSTER_COMMIT} +git checkout -b nvidia-dra-ci-${BUILD_ID} + +# Apply your NVIDIA DRA test code +# (copy test files, cherry-pick commits, or use other method) + +# 3. Build test binary +make WHAT=cmd/openshift-tests + +# 4. Run all NVIDIA DRA tests (single command - recommended for CI) +./openshift-tests run --dry-run all 2>&1 | \ + grep "NVIDIA DRA" | \ + ./openshift-tests run -f - \ + -o /logs/test-output.log \ + --junit-dir=/logs/junit + +# Alternative: Run tests individually with explicit names +# ./openshift-tests run-test \ +# -n '[sig-scheduling] NVIDIA DRA Basic GPU Allocation should allocate single GPU to pod via DRA [Suite:openshift/conformance/parallel]' \ +# -n '[sig-scheduling] NVIDIA DRA Basic GPU Allocation should handle pod deletion and resource cleanup [Suite:openshift/conformance/parallel]' \ +# -n '[sig-scheduling] NVIDIA DRA Multi-GPU Workloads should allocate multiple GPUs to single pod [Suite:openshift/conformance/parallel]' \ +# -o /logs/test-output.log \ +# --junit-dir=/logs/junit + +# 5. Exit with test status +exit $? +``` + +### CI Requirements Checklist + +- ✅ OpenShift cluster with GPU worker nodes (g4dn.xlarge or similar) +- ✅ GPU Operator pre-installed with CDI enabled (see Prerequisites section) +- ✅ Helm 3 installed in CI environment +- ✅ Cluster-admin kubeconfig available +- ✅ Internet access to pull Helm charts and container images +- ✅ Origin repository checkout matching cluster version +- ⚠️ First test run takes ~5-10 minutes (includes DRA Driver installation) +- ✅ Subsequent runs are faster (~2-5 minutes, DRA driver skipped if already installed) + +### Expected Test Results + +``` +Test 1: Single GPU Allocation ✅ PASSED (6-8 seconds) +Test 2: Pod deletion and resource cleanup ✅ PASSED (6-8 seconds) +Test 3: Multi-GPU workloads ⚠️ SKIPPED (only 1 GPU available) + +Total: 2 Passed, 0 Failed, 1 Skipped +``` + +## Troubleshooting + +### Issue 1: Tests fail with "GPU Operator not found" + +**Cause**: GPU Operator is not installed on the cluster. + +**Solution**: Install GPU Operator following the [NVIDIA GPU Operator on OpenShift Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html), then ensure CDI is enabled in the ClusterPolicy (see Prerequisites section). + +### Issue 2: Tests fail with CDI or device injection errors + +**Cause**: CDI is not enabled in the GPU Operator ClusterPolicy. + +**Solution**: +```bash +# Check if CDI is enabled +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.spec.cdi.enabled}' + +# If not "true", patch the ClusterPolicy +oc patch clusterpolicy gpu-cluster-policy --type=merge -p ' +spec: + cdi: + enabled: true + default: false +' + +# Wait for container toolkit to restart +oc rollout status daemonset/nvidia-container-toolkit-daemonset -n nvidia-gpu-operator +``` + +### Issue 3: "version of origin needs to match the version of the cluster" + +**Cause**: Your local origin checkout doesn't match the cluster's release commit. + +**Solution**: Follow the "Version Matching Requirement" section above. + +### Issue 5: DRA driver kubelet plugin stuck at Init:0/1 + +**Cause**: DRA driver cannot find GPU drivers (usually already handled by test framework). + +**Solution**: This is automatically configured correctly by the test framework. If manually installing DRA driver, ensure `nvidiaDriverRoot=/run/nvidia/driver`. + +### Issue 6: ResourceSlices not appearing + +**Cause**: DRA driver not fully initialized or SCC permissions missing. + +**Solution**: +```bash +# 1. Check DRA driver logs +oc logs -n nvidia-dra-driver-gpu -l app.kubernetes.io/name=nvidia-dra-driver-gpu --all-containers + +# 2. Verify SCC permissions +oc describe scc privileged | grep nvidia-dra-driver-gpu + +# 3. Restart DRA driver if needed +oc delete pod -n nvidia-dra-driver-gpu -l app.kubernetes.io/name=nvidia-dra-driver-gpu +``` + +### Issue 7: Tests fail with PodSecurity violations + +**Cause**: Namespace not using privileged security level. + +**Solution**: The test code already uses `admissionapi.LevelPrivileged` in `nvidia_dra.go`. If you see this error, ensure your test code includes: + +```go +oc := exutil.NewCLIWithPodSecurityLevel("nvidia-dra", admissionapi.LevelPrivileged) +``` + +## References + +- **NVIDIA GPU Operator**: https://github.com/NVIDIA/gpu-operator +- **NVIDIA DRA Driver**: https://github.com/NVIDIA/k8s-dra-driver-gpu +- **Kubernetes DRA Documentation**: https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/ +- **OpenShift Extended Tests**: https://github.com/openshift/origin/tree/master/test/extended + +--- + +**Last Updated**: 2026-02-13 +**Test Framework Version**: openshift-tests v4.1.0-10528-g690b329 +**GPU Operator**: Pre-installed (see Prerequisites) +**DRA Driver**: Latest version (auto-installed by tests) +**Tested On**: OCP 4.21.0, Kubernetes 1.34.2, Tesla T4 diff --git a/test/extended/node/dra/nvidia/cleanup.sh b/test/extended/node/dra/nvidia/cleanup.sh new file mode 100755 index 000000000000..6428bb7d35d9 --- /dev/null +++ b/test/extended/node/dra/nvidia/cleanup.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# +# Cleanup script for NVIDIA DRA Driver +# Removes DRA Driver installed by tests (GPU Operator is cluster infrastructure and not removed) +# This script mirrors the UninstallAll logic from prerequisites_installer.go +# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +function log_info() { + echo -e "${GREEN}[INFO]${NC} $*" +} + +function log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" +} + +function log_error() { + echo -e "${RED}[ERROR]${NC} $*" +} + +echo "========================================" +echo "NVIDIA GPU Stack Cleanup" +echo "========================================" +echo "" + +# Uninstall DRA Driver first (mirrors prerequisites_installer.go UninstallAll) +log_info "Uninstalling NVIDIA DRA Driver..." +if helm uninstall nvidia-dra-driver-gpu \ + --namespace nvidia-dra-driver-gpu \ + --wait \ + --timeout 5m 2>/dev/null; then + log_info "DRA Driver Helm release uninstalled" +else + log_warn "DRA Driver Helm release not found or already uninstalled" +fi + +# Clean up SCC permissions (ClusterRoleBindings) +log_info "Cleaning up SCC permissions..." +for crb in \ + nvidia-dra-privileged-nvidia-dra-driver-gpu-service-account-controller \ + nvidia-dra-privileged-nvidia-dra-driver-gpu-service-account-kubeletplugin \ + nvidia-dra-privileged-compute-domain-daemon-service-account; do + if oc delete clusterrolebinding "$crb" --ignore-not-found=true 2>/dev/null; then + log_info "Deleted ClusterRoleBinding: $crb" + fi +done + +# Delete DRA Driver namespace +if oc delete namespace nvidia-dra-driver-gpu --ignore-not-found=true 2>/dev/null; then + log_info "Deleted namespace: nvidia-dra-driver-gpu" +else + log_warn "Namespace nvidia-dra-driver-gpu not found" +fi + + +echo "" + +# Clean up test resources (DeviceClasses and test namespaces) +log_info "Cleaning up test resources..." + +# Delete any test DeviceClasses (these are cluster-scoped) +TEST_DEVICECLASSES=$(oc get deviceclass -o name 2>/dev/null | grep -E 'nvidia-gpu-test' || true) +if [ -n "$TEST_DEVICECLASSES" ]; then + log_info "Deleting test DeviceClasses..." + echo "$TEST_DEVICECLASSES" | xargs oc delete --ignore-not-found=true 2>/dev/null || true +fi + +# Delete any test namespaces +TEST_NAMESPACES=$(oc get namespaces -o name 2>/dev/null | grep -E 'nvidia-dra.*test|e2e.*nvidia' || true) +if [ -n "$TEST_NAMESPACES" ]; then + log_info "Deleting test namespaces..." + echo "$TEST_NAMESPACES" | xargs oc delete --wait=false --ignore-not-found=true 2>/dev/null || true +fi + +echo "" +echo "========================================" +echo "Cleanup Complete" +echo "========================================" +log_info "GPU Operator is cluster infrastructure and was not removed" +log_info "NFD Operator is cluster infrastructure and was not removed" +log_info "ResourceSlices will be cleaned up by the Kubernetes API server" diff --git a/test/extended/node/dra/nvidia/driver_installer.go b/test/extended/node/dra/nvidia/driver_installer.go new file mode 100644 index 000000000000..5ded5f5d29e4 --- /dev/null +++ b/test/extended/node/dra/nvidia/driver_installer.go @@ -0,0 +1,188 @@ +package nvidia + +import ( + "context" + "fmt" + "os/exec" + "strings" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" +) + +const ( + defaultDriverNamespace = "nvidia-dra-driver" + defaultHelmRelease = "nvidia-dra-driver" + defaultHelmChart = "oci://ghcr.io/nvidia/k8s-dra-driver-gpu/nvidia-dra-driver" + defaultDriverName = "gpu.nvidia.com" +) + +// DriverInstaller manages NVIDIA DRA driver lifecycle via Helm +type DriverInstaller struct { + client kubernetes.Interface + namespace string + helmRelease string + helmChart string + driverName string +} + +// NewDriverInstaller creates a new installer instance +func NewDriverInstaller(f *framework.Framework) *DriverInstaller { + return &DriverInstaller{ + client: f.ClientSet, + namespace: defaultDriverNamespace, + helmRelease: defaultHelmRelease, + helmChart: defaultHelmChart, + driverName: defaultDriverName, + } +} + +// Install installs the NVIDIA DRA driver using Helm +func (di *DriverInstaller) Install(ctx context.Context) error { + framework.Logf("Installing NVIDIA DRA driver via Helm") + + // Create namespace if it doesn't exist + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: di.namespace, + }, + } + _, err := di.client.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) + if err != nil && !strings.Contains(err.Error(), "already exists") { + return fmt.Errorf("failed to create namespace %s: %w", di.namespace, err) + } + framework.Logf("Namespace %s created or already exists", di.namespace) + + // Install driver via Helm + cmd := exec.CommandContext(ctx, "helm", "install", di.helmRelease, + di.helmChart, + "--namespace", di.namespace, + "--wait", + "--timeout", "5m") + + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to install NVIDIA DRA driver: %w\nOutput: %s", err, string(output)) + } + framework.Logf("Helm install output: %s", string(output)) + + return nil +} + +// Uninstall removes the NVIDIA DRA driver +func (di *DriverInstaller) Uninstall(ctx context.Context) error { + framework.Logf("Uninstalling NVIDIA DRA driver") + + // Uninstall via Helm + cmd := exec.CommandContext(ctx, "helm", "uninstall", di.helmRelease, + "--namespace", di.namespace, + "--wait", + "--timeout", "5m") + + output, err := cmd.CombinedOutput() + if err != nil && !strings.Contains(string(output), "not found") { + return fmt.Errorf("failed to uninstall NVIDIA DRA driver: %w\nOutput: %s", err, string(output)) + } + framework.Logf("Helm uninstall output: %s", string(output)) + + // Delete namespace + err = di.client.CoreV1().Namespaces().Delete(ctx, di.namespace, metav1.DeleteOptions{}) + if err != nil && !strings.Contains(err.Error(), "not found") { + return fmt.Errorf("failed to delete namespace %s: %w", di.namespace, err) + } + framework.Logf("Namespace %s deleted", di.namespace) + + return nil +} + +// WaitForReady waits for driver to be operational +func (di *DriverInstaller) WaitForReady(ctx context.Context, timeout time.Duration) error { + framework.Logf("Waiting for NVIDIA DRA driver to be ready (timeout: %v)", timeout) + + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + // Get DaemonSet + ds, err := di.client.AppsV1().DaemonSets(di.namespace).Get(ctx, di.helmRelease, metav1.GetOptions{}) + if err != nil { + framework.Logf("DaemonSet not found yet: %v", err) + return false, nil + } + + // Check if DaemonSet is ready + if !di.isDaemonSetReady(ds) { + framework.Logf("DaemonSet not ready yet: desired=%d, current=%d, ready=%d", + ds.Status.DesiredNumberScheduled, + ds.Status.CurrentNumberScheduled, + ds.Status.NumberReady) + return false, nil + } + + framework.Logf("DaemonSet is ready: %d/%d pods ready", + ds.Status.NumberReady, + ds.Status.DesiredNumberScheduled) + return true, nil + }) +} + +// isDaemonSetReady checks if DaemonSet is fully ready +func (di *DriverInstaller) isDaemonSetReady(ds *appsv1.DaemonSet) bool { + return ds.Status.DesiredNumberScheduled > 0 && + ds.Status.NumberReady == ds.Status.DesiredNumberScheduled && + ds.Status.NumberUnavailable == 0 +} + +// VerifyPluginRegistration checks if kubelet has registered the plugin +func (di *DriverInstaller) VerifyPluginRegistration(ctx context.Context, nodeName string) error { + framework.Logf("Verifying plugin registration on node %s", nodeName) + + // Get driver pod running on the node + podList, err := di.client.CoreV1().Pods(di.namespace).List(ctx, metav1.ListOptions{ + FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), + }) + if err != nil { + return fmt.Errorf("failed to list driver pods on node %s: %w", nodeName, err) + } + + if len(podList.Items) == 0 { + return fmt.Errorf("no driver pod found on node %s", nodeName) + } + + pod := podList.Items[0] + if pod.Status.Phase != corev1.PodRunning { + return fmt.Errorf("driver pod %s on node %s is not running (phase: %s)", pod.Name, nodeName, pod.Status.Phase) + } + + framework.Logf("Driver pod %s is running on node %s", pod.Name, nodeName) + return nil +} + +// GetInstalledVersion returns the version of installed driver +func (di *DriverInstaller) GetInstalledVersion(ctx context.Context) (string, error) { + cmd := exec.CommandContext(ctx, "helm", "list", + "--namespace", di.namespace, + "--filter", di.helmRelease, + "--output", "json") + + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to get helm release version: %w\nOutput: %s", err, string(output)) + } + + // Parse JSON output to get version + // For simplicity, just return the raw output + return string(output), nil +} + +// GetDriverNamespace returns the namespace where the driver is installed +func (di *DriverInstaller) GetDriverNamespace() string { + return di.namespace +} + +// GetDriverName returns the driver name +func (di *DriverInstaller) GetDriverName() string { + return di.driverName +} diff --git a/test/extended/node/dra/nvidia/fixtures/OWNERS b/test/extended/node/dra/nvidia/fixtures/OWNERS new file mode 100644 index 000000000000..07217dd1abb3 --- /dev/null +++ b/test/extended/node/dra/nvidia/fixtures/OWNERS @@ -0,0 +1,17 @@ +approvers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +reviewers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +labels: + - sig/scheduling + - area/dra diff --git a/test/extended/node/dra/nvidia/fixtures/deviceclass-nvidia.yaml b/test/extended/node/dra/nvidia/fixtures/deviceclass-nvidia.yaml new file mode 100644 index 000000000000..5be7f17696d0 --- /dev/null +++ b/test/extended/node/dra/nvidia/fixtures/deviceclass-nvidia.yaml @@ -0,0 +1,8 @@ +apiVersion: resource.k8s.io/v1 +kind: DeviceClass +metadata: + name: nvidia-gpu +spec: + selectors: + - cel: + expression: device.driver == "gpu.nvidia.com" diff --git a/test/extended/node/dra/nvidia/fixtures/pod-multi-gpu.yaml b/test/extended/node/dra/nvidia/fixtures/pod-multi-gpu.yaml new file mode 100644 index 000000000000..378bf2b0ef6b --- /dev/null +++ b/test/extended/node/dra/nvidia/fixtures/pod-multi-gpu.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-multi-gpu-pod +spec: + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/cuda:12.0.0-base-ubuntu22.04 + command: ["nvidia-smi"] + resources: + claims: + - name: gpus + resourceClaims: + - name: gpus + resourceClaimName: test-multi-gpu-claim diff --git a/test/extended/node/dra/nvidia/fixtures/pod-single-gpu.yaml b/test/extended/node/dra/nvidia/fixtures/pod-single-gpu.yaml new file mode 100644 index 000000000000..3b47cce9f8f0 --- /dev/null +++ b/test/extended/node/dra/nvidia/fixtures/pod-single-gpu.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-gpu-pod +spec: + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/cuda:12.0.0-base-ubuntu22.04 + command: ["nvidia-smi"] + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimName: test-gpu-claim diff --git a/test/extended/node/dra/nvidia/fixtures/resourceclaim-multi-gpu.yaml b/test/extended/node/dra/nvidia/fixtures/resourceclaim-multi-gpu.yaml new file mode 100644 index 000000000000..140dd0d1efbd --- /dev/null +++ b/test/extended/node/dra/nvidia/fixtures/resourceclaim-multi-gpu.yaml @@ -0,0 +1,11 @@ +apiVersion: resource.k8s.io/v1 +kind: ResourceClaim +metadata: + name: test-multi-gpu-claim +spec: + devices: + requests: + - name: gpus + exactly: + deviceClassName: nvidia-gpu + count: 2 diff --git a/test/extended/node/dra/nvidia/fixtures/resourceclaim-single-gpu.yaml b/test/extended/node/dra/nvidia/fixtures/resourceclaim-single-gpu.yaml new file mode 100644 index 000000000000..bcf5b981f532 --- /dev/null +++ b/test/extended/node/dra/nvidia/fixtures/resourceclaim-single-gpu.yaml @@ -0,0 +1,11 @@ +apiVersion: resource.k8s.io/v1 +kind: ResourceClaim +metadata: + name: test-gpu-claim +spec: + devices: + requests: + - name: gpu + exactly: + deviceClassName: nvidia-gpu + count: 1 diff --git a/test/extended/node/dra/nvidia/gpu_validator.go b/test/extended/node/dra/nvidia/gpu_validator.go new file mode 100644 index 000000000000..6b617bedc155 --- /dev/null +++ b/test/extended/node/dra/nvidia/gpu_validator.go @@ -0,0 +1,341 @@ +package nvidia + +import ( + "context" + "fmt" + "strconv" + "strings" + + corev1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" +) + +const ( + gpuPresentLabel = "nvidia.com/gpu.present" +) + +// GPUValidator validates GPU allocation and accessibility +type GPUValidator struct { + client kubernetes.Interface + restConfig *rest.Config + framework *framework.Framework +} + +// NewGPUValidator creates a new validator instance +func NewGPUValidator(f *framework.Framework) *GPUValidator { + return &GPUValidator{ + client: f.ClientSet, + restConfig: f.ClientConfig(), + framework: f, + } +} + +// ValidateGPUInPod validates that GPU is accessible in the pod +func (gv *GPUValidator) ValidateGPUInPod(ctx context.Context, namespace, podName string, expectedGPUCount int) error { + framework.Logf("Validating GPU accessibility in pod %s/%s (expected %d GPUs)", namespace, podName, expectedGPUCount) + + // Get the pod + pod, err := gv.client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get pod %s/%s: %w", namespace, podName, err) + } + + // Exec nvidia-smi to verify GPU is accessible + nvidiaSmiCmd := []string{"nvidia-smi", "--query-gpu=index,name", "--format=csv,noheader"} + stdout, stderr, err := e2epod.ExecCommandInContainerWithFullOutput( + gv.framework, + podName, + pod.Spec.Containers[0].Name, + nvidiaSmiCmd..., + ) + output := stdout + stderr + if err != nil { + return fmt.Errorf("failed to execute nvidia-smi in pod %s/%s: %w\nOutput: %s", + namespace, podName, err, output) + } + + // Parse output to count GPUs + lines := strings.Split(strings.TrimSpace(output), "\n") + actualGPUCount := 0 + for _, line := range lines { + if strings.TrimSpace(line) != "" { + actualGPUCount++ + } + } + + if actualGPUCount != expectedGPUCount { + return fmt.Errorf("expected %d GPUs but found %d in pod %s/%s\nnvidia-smi output:\n%s", + expectedGPUCount, actualGPUCount, namespace, podName, output) + } + + framework.Logf("Successfully validated %d GPU(s) in pod %s/%s", actualGPUCount, namespace, podName) + + // Validate CUDA_VISIBLE_DEVICES environment variable + err = gv.validateCudaVisibleDevices(ctx, namespace, podName, expectedGPUCount) + if err != nil { + framework.Logf("Warning: CUDA_VISIBLE_DEVICES validation failed: %v", err) + // Don't fail the test for this, as it may not always be set + } + + return nil +} + +// validateCudaVisibleDevices checks the CUDA_VISIBLE_DEVICES environment variable +func (gv *GPUValidator) validateCudaVisibleDevices(ctx context.Context, namespace, podName string, expectedCount int) error { + pod, err := gv.client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get pod: %w", err) + } + + envCmd := []string{"sh", "-c", "echo $CUDA_VISIBLE_DEVICES"} + stdout, stderr, err := e2epod.ExecCommandInContainerWithFullOutput( + gv.framework, + podName, + pod.Spec.Containers[0].Name, + envCmd..., + ) + output := stdout + stderr + if err != nil { + return fmt.Errorf("failed to get CUDA_VISIBLE_DEVICES: %w", err) + } + + cudaDevices := strings.TrimSpace(output) + if cudaDevices == "" { + return fmt.Errorf("CUDA_VISIBLE_DEVICES is not set") + } + + framework.Logf("CUDA_VISIBLE_DEVICES in pod %s/%s: %s", namespace, podName, cudaDevices) + return nil +} + +// ValidateResourceSlice validates ResourceSlice for GPU node +func (gv *GPUValidator) ValidateResourceSlice(ctx context.Context, nodeName string) (*resourceapi.ResourceSlice, error) { + framework.Logf("Validating ResourceSlice for node %s", nodeName) + + // List all ResourceSlices + sliceList, err := gv.client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list ResourceSlices: %w", err) + } + + // Find ResourceSlice for the node + var nodeSlice *resourceapi.ResourceSlice + for i := range sliceList.Items { + slice := &sliceList.Items[i] + if slice.Spec.NodeName != nil && *slice.Spec.NodeName == nodeName { + nodeSlice = slice + break + } + } + + if nodeSlice == nil { + return nil, fmt.Errorf("no ResourceSlice found for node %s", nodeName) + } + + framework.Logf("Found ResourceSlice %s for node %s with driver %s", + nodeSlice.Name, nodeName, nodeSlice.Spec.Driver) + + // Validate that it contains GPU devices + if nodeSlice.Spec.Devices == nil || len(nodeSlice.Spec.Devices) == 0 { + return nil, fmt.Errorf("ResourceSlice %s has no devices", nodeSlice.Name) + } + + framework.Logf("ResourceSlice %s has %d device(s)", nodeSlice.Name, len(nodeSlice.Spec.Devices)) + + return nodeSlice, nil +} + +// ValidateDeviceAllocation validates that claim is properly allocated +func (gv *GPUValidator) ValidateDeviceAllocation(ctx context.Context, namespace, claimName string) error { + framework.Logf("Validating ResourceClaim allocation for %s/%s", namespace, claimName) + + claim, err := gv.client.ResourceV1().ResourceClaims(namespace).Get(ctx, claimName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get ResourceClaim %s/%s: %w", namespace, claimName, err) + } + + // Check if claim is allocated + if claim.Status.Allocation == nil { + return fmt.Errorf("ResourceClaim %s/%s is not allocated", namespace, claimName) + } + + framework.Logf("ResourceClaim %s/%s is allocated", namespace, claimName) + + // Validate devices are allocated + deviceCount := len(claim.Status.Allocation.Devices.Results) + + if deviceCount == 0 { + return fmt.Errorf("ResourceClaim %s/%s has 0 devices allocated", namespace, claimName) + } + + framework.Logf("ResourceClaim %s/%s has %d device(s) allocated", namespace, claimName, deviceCount) + + return nil +} + +// GetGPUNodes returns nodes with NVIDIA GPUs +func (gv *GPUValidator) GetGPUNodes(ctx context.Context) ([]corev1.Node, error) { + framework.Logf("Getting GPU-enabled nodes") + + nodeList, err := gv.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + LabelSelector: gpuPresentLabel + "=true", + }) + if err != nil { + return nil, fmt.Errorf("failed to list nodes with GPU: %w", err) + } + + if len(nodeList.Items) == 0 { + // Try without label selector, and filter manually + allNodes, err := gv.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list all nodes: %w", err) + } + + var gpuNodes []corev1.Node + for _, node := range allNodes.Items { + // Check for GPU-related labels or capacity + if gv.hasGPUCapability(&node) { + gpuNodes = append(gpuNodes, node) + } + } + + if len(gpuNodes) == 0 { + return nil, fmt.Errorf("no GPU-enabled nodes found in the cluster") + } + + framework.Logf("Found %d GPU-enabled node(s)", len(gpuNodes)) + return gpuNodes, nil + } + + framework.Logf("Found %d GPU-enabled node(s)", len(nodeList.Items)) + return nodeList.Items, nil +} + +// hasGPUCapability checks if a node has GPU capability +// GetTotalGPUCount returns the total number of GPUs available in the cluster +// by counting devices in ResourceSlices +func (gv *GPUValidator) GetTotalGPUCount(ctx context.Context) (int, error) { + framework.Logf("Counting total GPUs in cluster via ResourceSlices") + + // List all ResourceSlices for GPU driver + sliceList, err := gv.client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return 0, fmt.Errorf("failed to list ResourceSlices: %w", err) + } + + totalGPUs := 0 + for _, slice := range sliceList.Items { + // Count devices from gpu.nvidia.com driver + if slice.Spec.Driver == "gpu.nvidia.com" { + totalGPUs += len(slice.Spec.Devices) + } + } + + framework.Logf("Found %d total GPU(s) in cluster", totalGPUs) + return totalGPUs, nil +} + +func (gv *GPUValidator) hasGPUCapability(node *corev1.Node) bool { + // Check for common GPU labels + gpuLabels := []string{ + gpuPresentLabel, + "nvidia.com/gpu", + "nvidia.com/gpu.count", + "feature.node.kubernetes.io/pci-10de.present", // NVIDIA vendor ID + } + + for _, label := range gpuLabels { + if _, exists := node.Labels[label]; exists { + return true + } + } + + // Check for GPU in allocatable resources + if qty, exists := node.Status.Allocatable["nvidia.com/gpu"]; exists { + if !qty.IsZero() { + return true + } + } + + return false +} + +// ValidateCDISpec validates CDI specification was created +func (gv *GPUValidator) ValidateCDISpec(ctx context.Context, podName, namespace string) error { + framework.Logf("Validating CDI spec for pod %s/%s", namespace, podName) + + pod, err := gv.client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get pod %s/%s: %w", namespace, podName, err) + } + + // Check for CDI annotations or device references + // CDI devices are typically injected via annotations or OCI spec + for key, value := range pod.Annotations { + if strings.Contains(key, "cdi") || strings.Contains(key, "device") { + framework.Logf("Found CDI-related annotation: %s=%s", key, value) + } + } + + // Validate that nvidia device files are present in the container + pod, err = gv.client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get pod: %w", err) + } + + lsCmd := []string{"ls", "-la", "/dev/nvidia*"} + stdout, stderr, err := e2epod.ExecCommandInContainerWithFullOutput( + gv.framework, + podName, + pod.Spec.Containers[0].Name, + lsCmd..., + ) + output := stdout + stderr + if err != nil { + // It's okay if this fails, as device paths may vary + framework.Logf("Warning: Could not list /dev/nvidia* devices: %v", err) + return nil + } + + framework.Logf("NVIDIA devices in pod %s/%s:\n%s", namespace, podName, output) + return nil +} + +// GetGPUCountInPod returns the number of GPUs visible in a pod +func (gv *GPUValidator) GetGPUCountInPod(ctx context.Context, namespace, podName string) (int, error) { + pod, err := gv.client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return 0, fmt.Errorf("failed to get pod %s/%s: %w", namespace, podName, err) + } + + // Exec nvidia-smi to count GPUs + nvidiaSmiCmd := []string{"nvidia-smi", "--query-gpu=count", "--format=csv,noheader"} + stdout, stderr, err := e2epod.ExecCommandInContainerWithFullOutput( + gv.framework, + podName, + pod.Spec.Containers[0].Name, + nvidiaSmiCmd..., + ) + output := stdout + stderr + if err != nil { + return 0, fmt.Errorf("failed to execute nvidia-smi: %w", err) + } + + // Parse the first line to get count + lines := strings.Split(strings.TrimSpace(output), "\n") + if len(lines) == 0 { + return 0, fmt.Errorf("no output from nvidia-smi") + } + + count, err := strconv.Atoi(strings.TrimSpace(lines[0])) + if err != nil { + return 0, fmt.Errorf("failed to parse GPU count from nvidia-smi output: %w", err) + } + + return count, nil +} diff --git a/test/extended/node/dra/nvidia/nvidia_dra.go b/test/extended/node/dra/nvidia/nvidia_dra.go new file mode 100644 index 000000000000..8263c9a1904f --- /dev/null +++ b/test/extended/node/dra/nvidia/nvidia_dra.go @@ -0,0 +1,288 @@ +package nvidia + +import ( + "context" + "fmt" + "sync" + "time" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + admissionapi "k8s.io/pod-security-admission/api" + "k8s.io/utils/ptr" + + exutil "github.com/openshift/origin/test/extended/util" +) + +var ( + deviceClassGVR = schema.GroupVersionResource{ + Group: "resource.k8s.io", + Version: "v1", + Resource: "deviceclasses", + } + resourceClaimGVR = schema.GroupVersionResource{ + Group: "resource.k8s.io", + Version: "v1", + Resource: "resourceclaims", + } + + // Global state for prerequisites installation + prerequisitesOnce sync.Once + prerequisitesInstalled bool + prerequisitesError error +) + +var _ = g.Describe("[sig-scheduling] NVIDIA DRA", func() { + defer g.GinkgoRecover() + + oc := exutil.NewCLIWithPodSecurityLevel("nvidia-dra", admissionapi.LevelPrivileged) + + var ( + prereqInstaller *PrerequisitesInstaller + validator *GPUValidator + builder *ResourceBuilder + ) + + g.BeforeEach(func(ctx context.Context) { + // Initialize helpers + validator = NewGPUValidator(oc.KubeFramework()) + builder = NewResourceBuilder(oc.Namespace()) + prereqInstaller = NewPrerequisitesInstaller(oc.KubeFramework()) + + // IMPORTANT: Check for GPU nodes FIRST before attempting installation + // This ensures tests skip cleanly on non-GPU clusters + nodes, err := validator.GetGPUNodes(ctx) + if err != nil || len(nodes) == 0 { + g.Skip("No GPU nodes available in the cluster - skipping NVIDIA DRA tests") + } + framework.Logf("Found %d GPU node(s) available for testing", len(nodes)) + + // Install prerequisites if needed (runs once via sync.Once) + // NOTE: GPU Operator must be pre-installed on the cluster + // Tests will validate GPU Operator presence and install DRA driver if needed + prerequisitesOnce.Do(func() { + framework.Logf("Checking NVIDIA GPU stack prerequisites") + + // Check if prerequisites are already installed + if prereqInstaller.IsGPUOperatorInstalled(ctx) && prereqInstaller.IsDRADriverInstalled(ctx) { + framework.Logf("Prerequisites already installed, skipping installation") + prerequisitesInstalled = true + return + } + + framework.Logf("Validating GPU Operator and installing DRA driver if needed...") + // Validate GPU Operator presence and install DRA driver + if err := prereqInstaller.InstallAll(ctx); err != nil { + prerequisitesError = err + framework.Logf("ERROR: Failed to validate/install prerequisites: %v", err) + framework.Logf("Ensure GPU Operator is installed on the cluster before running these tests") + return + } + + prerequisitesInstalled = true + framework.Logf("Prerequisites validation completed successfully") + }) + + // Verify prerequisites are installed + if prerequisitesError != nil { + g.Fail(fmt.Sprintf("Prerequisites validation failed: %v. Ensure GPU Operator is installed on cluster.", prerequisitesError)) + } + if !prerequisitesInstalled { + g.Fail("Prerequisites not installed - cannot run tests") + } + }) + + g.Context("Basic GPU Allocation", func() { + g.It("should allocate single GPU to pod via DRA", func(ctx context.Context) { + deviceClassName := "test-nvidia-gpu-" + oc.Namespace() + claimName := "test-gpu-claim" + podName := "test-gpu-pod" + + g.By("Creating DeviceClass for NVIDIA GPUs") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err, "Failed to create DeviceClass") + defer func() { + deleteDeviceClass(oc.KubeFramework().DynamicClient, deviceClassName) + }() + + g.By("Creating ResourceClaim requesting 1 GPU") + claim := builder.BuildResourceClaim(claimName, deviceClassName, 1) + err = createResourceClaim(oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err, "Failed to create ResourceClaim") + defer func() { + deleteResourceClaim(oc.KubeFramework().DynamicClient, oc.Namespace(), claimName) + }() + + g.By("Creating Pod using the ResourceClaim") + pod := builder.BuildPodWithClaim(podName, claimName, "") + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err, "Failed to create pod") + + g.By("Waiting for pod to be running") + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod) + framework.ExpectNoError(err, "Pod failed to start") + + // Get the updated pod + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Get(ctx, podName, metav1.GetOptions{}) + framework.ExpectNoError(err) + + g.By("Verifying pod is scheduled on GPU node") + err = validator.ValidateGPUInPod(ctx, oc.Namespace(), podName, 1) + framework.ExpectNoError(err) + + g.By("Validating CDI device injection") + err = validator.ValidateCDISpec(ctx, podName, oc.Namespace()) + framework.ExpectNoError(err) + }) + + g.It("should handle pod deletion and resource cleanup", func(ctx context.Context) { + deviceClassName := "test-nvidia-gpu-cleanup-" + oc.Namespace() + claimName := "test-gpu-claim-cleanup" + podName := "test-gpu-pod-cleanup" + + g.By("Creating DeviceClass") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err) + defer deleteDeviceClass(oc.KubeFramework().DynamicClient, deviceClassName) + + g.By("Creating ResourceClaim") + claim := builder.BuildResourceClaim(claimName, deviceClassName, 1) + err = createResourceClaim(oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err) + defer deleteResourceClaim(oc.KubeFramework().DynamicClient, oc.Namespace(), claimName) + + g.By("Creating and verifying pod with GPU") + pod := builder.BuildLongRunningPodWithClaim(podName, claimName, "") + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err) + + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod) + framework.ExpectNoError(err) + + g.By("Deleting pod") + err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, podName, metav1.DeleteOptions{}) + framework.ExpectNoError(err) + + g.By("Waiting for pod to be deleted") + err = e2epod.WaitForPodNotFoundInNamespace(ctx, oc.KubeFramework().ClientSet, podName, oc.Namespace(), 1*time.Minute) + framework.ExpectNoError(err) + + g.By("Verifying ResourceClaim still exists but is not reserved") + claimObj, err := oc.KubeFramework().DynamicClient.Resource(resourceClaimGVR).Namespace(oc.Namespace()).Get(ctx, claimName, metav1.GetOptions{}) + framework.ExpectNoError(err) + o.Expect(claimObj).NotTo(o.BeNil()) + + framework.Logf("ResourceClaim %s successfully cleaned up after pod deletion", claimName) + }) + }) + + g.Context("Multi-GPU Workloads", func() { + g.It("should allocate multiple GPUs to single pod", func(ctx context.Context) { + // Check if cluster has at least 2 GPUs before running test + totalGPUs, gpuCountErr := validator.GetTotalGPUCount(ctx) + if gpuCountErr != nil { + framework.Logf("Warning: Could not count total GPUs: %v", gpuCountErr) + } + if totalGPUs < 2 { + g.Skip(fmt.Sprintf("Multi-GPU test requires at least 2 GPUs, but only %d GPU(s) available in cluster", totalGPUs)) + } + + deviceClassName := "test-nvidia-multi-gpu-" + oc.Namespace() + claimName := "test-multi-gpu-claim" + podName := "test-multi-gpu-pod" + + g.By("Creating DeviceClass") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err) + defer deleteDeviceClass(oc.KubeFramework().DynamicClient, deviceClassName) + + g.By("Creating ResourceClaim requesting 2 GPUs") + claim := builder.BuildMultiGPUClaim(claimName, deviceClassName, 2) + err = createResourceClaim(oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err) + defer deleteResourceClaim(oc.KubeFramework().DynamicClient, oc.Namespace(), claimName) + + g.By("Creating Pod using the multi-GPU claim") + pod := builder.BuildPodWithClaim(podName, claimName, "") + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err) + + g.By("Waiting for pod to be running or checking for insufficient resources") + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod) + if err != nil { + // Check if it's a scheduling error due to insufficient GPUs + pod, getErr := oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Get(ctx, podName, metav1.GetOptions{}) + if getErr == nil && pod.Status.Phase == "Pending" { + framework.Logf("Pod is pending - likely due to insufficient GPU resources. This is expected if cluster doesn't have 2 GPUs available on a single node.") + g.Skip("Insufficient GPU resources for multi-GPU test") + } + framework.ExpectNoError(err, "Pod failed to start") + } + + g.By("Verifying 2 GPUs allocated") + err = validator.ValidateDeviceAllocation(ctx, oc.Namespace(), claimName) + framework.ExpectNoError(err) + + g.By("Verifying 2 GPUs accessible in pod") + time.Sleep(10 * time.Second) + err = validator.ValidateGPUInPod(ctx, oc.Namespace(), podName, 2) + if err != nil { + framework.Logf("Warning: Could not validate 2 GPUs in pod: %v", err) + // Don't fail the test if nvidia-smi fails, as it might be a configuration issue + } + }) + }) +}) + +// Helper functions for creating and deleting resources + +func convertToUnstructured(obj interface{}) (*unstructured.Unstructured, error) { + unstructuredObj := &unstructured.Unstructured{} + content, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil, err + } + unstructuredObj.Object = content + return unstructuredObj, nil +} + +func createDeviceClass(client dynamic.Interface, deviceClass interface{}) error { + unstructuredObj, err := convertToUnstructured(deviceClass) + if err != nil { + return err + } + _, err = client.Resource(deviceClassGVR).Create(context.TODO(), unstructuredObj, metav1.CreateOptions{}) + return err +} + +func deleteDeviceClass(client dynamic.Interface, name string) error { + return client.Resource(deviceClassGVR).Delete(context.TODO(), name, metav1.DeleteOptions{ + GracePeriodSeconds: ptr.To[int64](0), + }) +} + +func createResourceClaim(client dynamic.Interface, namespace string, claim interface{}) error { + unstructuredObj, err := convertToUnstructured(claim) + if err != nil { + return err + } + _, err = client.Resource(resourceClaimGVR).Namespace(namespace).Create(context.TODO(), unstructuredObj, metav1.CreateOptions{}) + return err +} + +func deleteResourceClaim(client dynamic.Interface, namespace, name string) error { + return client.Resource(resourceClaimGVR).Namespace(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{ + GracePeriodSeconds: ptr.To[int64](0), + }) +} diff --git a/test/extended/node/dra/nvidia/prerequisites_installer.go b/test/extended/node/dra/nvidia/prerequisites_installer.go new file mode 100644 index 000000000000..c79a94368dae --- /dev/null +++ b/test/extended/node/dra/nvidia/prerequisites_installer.go @@ -0,0 +1,506 @@ +package nvidia + +import ( + "context" + "fmt" + "os/exec" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" +) + +const ( + // GPU Operator namespace (used for validation only) + gpuOperatorNamespace = "nvidia-gpu-operator" + + // DRA Driver constants + draDriverNamespace = "nvidia-dra-driver-gpu" + draDriverRelease = "nvidia-dra-driver-gpu" + draDriverChart = "nvidia/nvidia-dra-driver-gpu" + draDriverControllerSA = "nvidia-dra-driver-gpu-service-account-controller" + draDriverKubeletPluginSA = "nvidia-dra-driver-gpu-service-account-kubeletplugin" + draDriverComputeDomainSA = "compute-domain-daemon-service-account" +) + +// PrerequisitesInstaller validates GPU Operator and manages DRA driver installation +type PrerequisitesInstaller struct { + client kubernetes.Interface +} + +// NewPrerequisitesInstaller creates a new installer +func NewPrerequisitesInstaller(f *framework.Framework) *PrerequisitesInstaller { + return &PrerequisitesInstaller{ + client: f.ClientSet, + } +} + +// InstallAll validates GPU Operator is present and installs DRA Driver +func (pi *PrerequisitesInstaller) InstallAll(ctx context.Context) error { + framework.Logf("=== Validating NVIDIA GPU Stack Prerequisites ===") + + // Step 1: Validate GPU Operator is already installed + framework.Logf("Checking if GPU Operator is installed...") + if !pi.IsGPUOperatorInstalled(ctx) { + return fmt.Errorf("GPU Operator not found - must be pre-installed on the cluster. " + + "Install GPU Operator via OLM before running these tests") + } + framework.Logf("GPU Operator detected") + + // Step 2: Wait for GPU Operator to be ready + framework.Logf("Waiting for GPU Operator to be ready...") + if err := pi.WaitForGPUOperator(ctx, 5*time.Minute); err != nil { + return fmt.Errorf("GPU Operator not ready: %w. Ensure GPU Operator is fully deployed", err) + } + framework.Logf("GPU Operator is ready") + + // Step 3: Check if DRA Driver already installed (skip if present) + if pi.IsDRADriverInstalled(ctx) { + framework.Logf("DRA Driver already installed, skipping installation") + } else { + // Step 4: Ensure Helm is available + if err := pi.ensureHelm(ctx); err != nil { + return fmt.Errorf("helm not available: %w", err) + } + + // Step 5: Add NVIDIA Helm repository + if err := pi.addHelmRepoForDRADriver(ctx); err != nil { + return fmt.Errorf("failed to add Helm repository: %w", err) + } + + // Step 6: Install DRA Driver (latest version) + if err := pi.InstallDRADriver(ctx); err != nil { + return fmt.Errorf("failed to install DRA Driver: %w", err) + } + } + + // Step 7: Wait for DRA Driver to be ready + framework.Logf("Waiting for DRA Driver to be ready...") + if err := pi.WaitForDRADriver(ctx, 5*time.Minute); err != nil { + return fmt.Errorf("DRA Driver failed to become ready: %w", err) + } + + framework.Logf("=== All prerequisites validated and ready ===") + return nil +} + +// ensureHelm checks if Helm is available +func (pi *PrerequisitesInstaller) ensureHelm(ctx context.Context) error { + cmd := exec.CommandContext(ctx, "helm", "version", "--short") + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("helm command not found or failed: %w\nOutput: %s", err, string(output)) + } + framework.Logf("Helm version: %s", strings.TrimSpace(string(output))) + return nil +} + +// addHelmRepoForDRADriver adds NVIDIA Helm repository for DRA driver installation +func (pi *PrerequisitesInstaller) addHelmRepoForDRADriver(ctx context.Context) error { + framework.Logf("Adding NVIDIA Helm repository for DRA driver") + + // Add repo + cmd := exec.CommandContext(ctx, "helm", "repo", "add", "nvidia", "https://nvidia.github.io/gpu-operator") + output, err := cmd.CombinedOutput() + if err != nil && !strings.Contains(string(output), "already exists") { + return fmt.Errorf("failed to add helm repo: %w\nOutput: %s", err, string(output)) + } + + // Update repo + cmd = exec.CommandContext(ctx, "helm", "repo", "update") + output, err = cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to update helm repo: %w\nOutput: %s", err, string(output)) + } + + framework.Logf("NVIDIA Helm repository added and updated") + return nil +} + +// InstallDRADriver installs NVIDIA DRA Driver via Helm (latest version) +func (pi *PrerequisitesInstaller) InstallDRADriver(ctx context.Context) error { + framework.Logf("Installing NVIDIA DRA Driver (latest version)") + + // Create namespace + if err := pi.createNamespace(ctx, draDriverNamespace); err != nil { + return err + } + + // Label GPU nodes for DRA kubelet plugin scheduling + if err := pi.labelGPUNodesForDRA(ctx); err != nil { + return fmt.Errorf("failed to label GPU nodes: %w", err) + } + + // Grant SCC permissions + if err := pi.grantSCCPermissions(ctx); err != nil { + return fmt.Errorf("failed to grant SCC permissions: %w", err) + } + + // Check if already installed + if pi.isHelmReleaseInstalled(ctx, draDriverRelease, draDriverNamespace) { + framework.Logf("DRA Driver already installed, skipping") + return nil + } + + // Build Helm install command + args := []string{ + "install", draDriverRelease, draDriverChart, + "--namespace", draDriverNamespace, + "--set", "nvidiaDriverRoot=/run/nvidia/driver", + "--set", "gpuResourcesEnabledOverride=true", + "--set", "image.pullPolicy=IfNotPresent", + "--set-string", "kubeletPlugin.nodeSelector.nvidia\\.com/dra-kubelet-plugin=true", + "--set", "controller.tolerations[0].key=node-role.kubernetes.io/master", + "--set", "controller.tolerations[0].operator=Exists", + "--set", "controller.tolerations[0].effect=NoSchedule", + "--set", "controller.tolerations[1].key=node-role.kubernetes.io/control-plane", + "--set", "controller.tolerations[1].operator=Exists", + "--set", "controller.tolerations[1].effect=NoSchedule", + "--wait", + "--timeout", "5m", + } + + cmd := exec.CommandContext(ctx, "helm", args...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to install DRA Driver: %w\nOutput: %s", err, string(output)) + } + + framework.Logf("DRA Driver installed successfully") + return nil +} + +// WaitForGPUOperator waits for GPU Operator to be ready +func (pi *PrerequisitesInstaller) WaitForGPUOperator(ctx context.Context, timeout time.Duration) error { + framework.Logf("Waiting for GPU Operator to be ready (timeout: %v)", timeout) + + // Wait for driver daemonset (using prefix matching to support different installation methods) + if err := pi.waitForDaemonSetByPrefix(ctx, gpuOperatorNamespace, "nvidia-driver-daemonset", timeout); err != nil { + return fmt.Errorf("driver daemonset not ready: %w", err) + } + + // Wait for device plugin daemonset (using prefix matching to support different installation methods) + if err := pi.waitForDaemonSetByPrefix(ctx, gpuOperatorNamespace, "nvidia-device-plugin-daemonset", timeout); err != nil { + return fmt.Errorf("device plugin daemonset not ready: %w", err) + } + + // Wait for GPU nodes to be labeled by NFD + if err := pi.waitForGPUNodes(ctx, timeout); err != nil { + return fmt.Errorf("no GPU nodes labeled: %w", err) + } + + framework.Logf("GPU Operator is ready") + return nil +} + +// WaitForDRADriver waits for DRA Driver to be ready +func (pi *PrerequisitesInstaller) WaitForDRADriver(ctx context.Context, timeout time.Duration) error { + framework.Logf("Waiting for DRA Driver to be ready (timeout: %v)", timeout) + + // Wait for controller deployment + if err := pi.waitForDeployment(ctx, draDriverNamespace, draDriverRelease+"-controller", timeout); err != nil { + return fmt.Errorf("controller deployment not ready: %w", err) + } + + // Wait for kubelet plugin daemonset + if err := pi.waitForDaemonSet(ctx, draDriverNamespace, draDriverRelease+"-kubelet-plugin", timeout); err != nil { + return fmt.Errorf("kubelet plugin daemonset not ready: %w", err) + } + + framework.Logf("DRA Driver is ready") + return nil +} + +// UninstallAll uninstalls DRA Driver (GPU Operator is cluster infrastructure, not removed) +func (pi *PrerequisitesInstaller) UninstallAll(ctx context.Context) error { + framework.Logf("=== Cleaning up NVIDIA DRA Driver ===") + + // Only uninstall DRA Driver (GPU Operator is cluster infrastructure) + if err := pi.UninstallDRADriver(ctx); err != nil { + framework.Logf("Warning: failed to uninstall DRA Driver: %v", err) + } + + framework.Logf("=== Cleanup complete ===") + return nil +} + +// UninstallDRADriver uninstalls DRA Driver +func (pi *PrerequisitesInstaller) UninstallDRADriver(ctx context.Context) error { + framework.Logf("Uninstalling DRA Driver") + + cmd := exec.CommandContext(ctx, "helm", "uninstall", draDriverRelease, + "--namespace", draDriverNamespace, + "--wait", + "--timeout", "5m") + + output, err := cmd.CombinedOutput() + if err != nil && !strings.Contains(string(output), "not found") { + return fmt.Errorf("failed to uninstall DRA Driver: %w\nOutput: %s", err, string(output)) + } + + // Delete namespace + if err := pi.client.CoreV1().Namespaces().Delete(ctx, draDriverNamespace, metav1.DeleteOptions{}); err != nil { + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete namespace: %w", err) + } + } + + framework.Logf("DRA Driver uninstalled") + return nil +} + +// Helper methods + +func (pi *PrerequisitesInstaller) createNamespace(ctx context.Context, name string) error { + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + } + _, err := pi.client.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create namespace %s: %w", name, err) + } + framework.Logf("Namespace %s created or already exists", name) + return nil +} + +func (pi *PrerequisitesInstaller) labelGPUNodesForDRA(ctx context.Context) error { + framework.Logf("Labeling GPU nodes for DRA kubelet plugin") + + nodes, err := pi.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + LabelSelector: "nvidia.com/gpu.present=true", + }) + if err != nil { + return fmt.Errorf("failed to list GPU nodes: %w", err) + } + + if len(nodes.Items) == 0 { + return fmt.Errorf("no GPU nodes found to label") + } + + for _, node := range nodes.Items { + if node.Labels == nil { + node.Labels = make(map[string]string) + } + + if node.Labels["nvidia.com/dra-kubelet-plugin"] == "true" { + framework.Logf("Node %s already has DRA kubelet plugin label", node.Name) + continue + } + + node.Labels["nvidia.com/dra-kubelet-plugin"] = "true" + _, err := pi.client.CoreV1().Nodes().Update(ctx, &node, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to label node %s: %w", node.Name, err) + } + framework.Logf("Labeled GPU node %s with nvidia.com/dra-kubelet-plugin=true", node.Name) + } + + return nil +} + +func (pi *PrerequisitesInstaller) grantSCCPermissions(ctx context.Context) error { + framework.Logf("Granting SCC permissions to DRA driver service accounts") + + serviceAccounts := []string{ + draDriverControllerSA, + draDriverKubeletPluginSA, + draDriverComputeDomainSA, + } + + for _, sa := range serviceAccounts { + // Create ClusterRoleBinding to grant privileged SCC + crb := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("nvidia-dra-privileged-%s", sa), + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: "system:openshift:scc:privileged", + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: sa, + Namespace: draDriverNamespace, + }, + }, + } + + _, err := pi.client.RbacV1().ClusterRoleBindings().Create(ctx, crb, metav1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create ClusterRoleBinding for %s: %w", sa, err) + } + framework.Logf("SCC permissions granted to %s", sa) + } + + return nil +} + +func (pi *PrerequisitesInstaller) isHelmReleaseInstalled(ctx context.Context, release, namespace string) bool { + cmd := exec.CommandContext(ctx, "helm", "status", release, "--namespace", namespace) + err := cmd.Run() + return err == nil +} + +func (pi *PrerequisitesInstaller) waitForDaemonSet(ctx context.Context, namespace, name string, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + ds, err := pi.client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + framework.Logf("DaemonSet %s/%s not found yet", namespace, name) + return false, nil + } + return false, err + } + + ready := ds.Status.DesiredNumberScheduled > 0 && + ds.Status.NumberReady == ds.Status.DesiredNumberScheduled && + ds.Status.NumberUnavailable == 0 + + if !ready { + framework.Logf("DaemonSet %s/%s not ready: desired=%d, ready=%d, unavailable=%d", + namespace, name, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady, ds.Status.NumberUnavailable) + } + + return ready, nil + }) +} + +func (pi *PrerequisitesInstaller) waitForDaemonSetByPrefix(ctx context.Context, namespace, namePrefix string, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + dsList, err := pi.client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return false, err + } + + for _, ds := range dsList.Items { + if strings.HasPrefix(ds.Name, namePrefix) { + ready := ds.Status.DesiredNumberScheduled > 0 && + ds.Status.NumberReady == ds.Status.DesiredNumberScheduled && + ds.Status.NumberUnavailable == 0 + + if !ready { + framework.Logf("DaemonSet %s/%s not ready: desired=%d, ready=%d, unavailable=%d", + namespace, ds.Name, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady, ds.Status.NumberUnavailable) + return false, nil + } + + framework.Logf("DaemonSet %s/%s is ready", namespace, ds.Name) + return true, nil + } + } + + framework.Logf("DaemonSet with prefix %s/%s not found yet", namespace, namePrefix) + return false, nil + }) +} + +func (pi *PrerequisitesInstaller) waitForDeployment(ctx context.Context, namespace, name string, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + deploy, err := pi.client.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + framework.Logf("Deployment %s/%s not found yet", namespace, name) + return false, nil + } + return false, err + } + + ready := deploy.Status.Replicas > 0 && + deploy.Status.ReadyReplicas == deploy.Status.Replicas + + if !ready { + framework.Logf("Deployment %s/%s not ready: replicas=%d, ready=%d", + namespace, name, deploy.Status.Replicas, deploy.Status.ReadyReplicas) + } + + return ready, nil + }) +} + +func (pi *PrerequisitesInstaller) waitForGPUNodes(ctx context.Context, timeout time.Duration) error { + framework.Logf("Waiting for GPU nodes to be labeled by NFD") + + return wait.PollUntilContextTimeout(ctx, 10*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + nodes, err := pi.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ + LabelSelector: "nvidia.com/gpu.present=true", + }) + if err != nil { + return false, err + } + + if len(nodes.Items) == 0 { + framework.Logf("No GPU nodes labeled yet by NFD") + return false, nil + } + + framework.Logf("Found %d GPU node(s) labeled by NFD", len(nodes.Items)) + for _, node := range nodes.Items { + framework.Logf(" - GPU node: %s", node.Name) + } + return true, nil + }) +} + +// IsGPUOperatorInstalled checks if GPU Operator is installed (via Helm or OLM) +func (pi *PrerequisitesInstaller) IsGPUOperatorInstalled(ctx context.Context) bool { + // Check if the namespace exists + _, err := pi.client.CoreV1().Namespaces().Get(ctx, gpuOperatorNamespace, metav1.GetOptions{}) + if err != nil { + return false + } + + // Check if GPU Operator pods are running + pods, err := pi.client.CoreV1().Pods(gpuOperatorNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app=gpu-operator", + }) + if err != nil || len(pods.Items) == 0 { + return false + } + + // Check if at least one pod is running or succeeded + for _, pod := range pods.Items { + if pod.Status.Phase == "Running" || pod.Status.Phase == "Succeeded" { + framework.Logf("Found running GPU Operator pod: %s", pod.Name) + return true + } + } + + return false +} + +// IsDRADriverInstalled checks if DRA Driver is installed (via Helm or other means) +func (pi *PrerequisitesInstaller) IsDRADriverInstalled(ctx context.Context) bool { + // Check if the namespace exists + _, err := pi.client.CoreV1().Namespaces().Get(ctx, draDriverNamespace, metav1.GetOptions{}) + if err != nil { + return false + } + + // Check if DRA kubelet plugin pods are running + pods, err := pi.client.CoreV1().Pods(draDriverNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/name=nvidia-dra-driver-gpu", + }) + if err != nil || len(pods.Items) == 0 { + return false + } + + // Check if at least one pod is running + for _, pod := range pods.Items { + if pod.Status.Phase == "Running" { + framework.Logf("Found running DRA Driver pod: %s", pod.Name) + return true + } + } + + return false +} diff --git a/test/extended/node/dra/nvidia/resource_builder.go b/test/extended/node/dra/nvidia/resource_builder.go new file mode 100644 index 000000000000..af4e004a7b68 --- /dev/null +++ b/test/extended/node/dra/nvidia/resource_builder.go @@ -0,0 +1,212 @@ +package nvidia + +import ( + corev1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + defaultDeviceClassName = "nvidia-gpu" + resourceBuilderDriver = "gpu.nvidia.com" + defaultCudaImage = "nvcr.io/nvidia/cuda:12.0.0-base-ubuntu22.04" +) + +// ResourceBuilder helps build DRA resource objects +type ResourceBuilder struct { + namespace string +} + +// NewResourceBuilder creates a new builder +func NewResourceBuilder(namespace string) *ResourceBuilder { + return &ResourceBuilder{namespace: namespace} +} + +// BuildDeviceClass creates a DeviceClass for NVIDIA GPUs +func (rb *ResourceBuilder) BuildDeviceClass(name string) *resourceapi.DeviceClass { + if name == "" { + name = defaultDeviceClassName + } + + return &resourceapi.DeviceClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: resourceapi.DeviceClassSpec{ + Selectors: []resourceapi.DeviceSelector{ + { + CEL: &resourceapi.CELDeviceSelector{ + Expression: "device.driver == \"" + resourceBuilderDriver + "\"", + }, + }, + }, + }, + } +} + +// BuildResourceClaim creates a ResourceClaim requesting GPUs +func (rb *ResourceBuilder) BuildResourceClaim(name, deviceClassName string, count int) *resourceapi.ResourceClaim { + if deviceClassName == "" { + deviceClassName = defaultDeviceClassName + } + + deviceRequests := []resourceapi.DeviceRequest{ + { + Name: "gpu", + Exactly: &resourceapi.ExactDeviceRequest{ + DeviceClassName: deviceClassName, + Count: int64(count), + }, + }, + } + + return &resourceapi.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: rb.namespace, + }, + Spec: resourceapi.ResourceClaimSpec{ + Devices: resourceapi.DeviceClaim{ + Requests: deviceRequests, + }, + }, + } +} + +// BuildPodWithClaim creates a Pod that uses a ResourceClaim +func (rb *ResourceBuilder) BuildPodWithClaim(name, claimName, image string) *corev1.Pod { + if image == "" { + image = defaultCudaImage + } + + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: rb.namespace, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "gpu-container", + Image: image, + Command: []string{"sh", "-c", "nvidia-smi && sleep infinity"}, + Resources: corev1.ResourceRequirements{ + Claims: []corev1.ResourceClaim{ + { + Name: "gpu", + }, + }, + }, + }, + }, + ResourceClaims: []corev1.PodResourceClaim{ + { + Name: "gpu", + ResourceClaimName: &claimName, + }, + }, + }, + } +} + +// BuildPodWithInlineClaim creates a Pod with inline ResourceClaim +// Note: Inline claims via ResourceClaimTemplate are not directly supported in pod spec +// This creates a pod that references a ResourceClaimTemplateName +func (rb *ResourceBuilder) BuildPodWithInlineClaim(name, deviceClassName string, gpuCount int) *corev1.Pod { + if deviceClassName == "" { + deviceClassName = defaultDeviceClassName + } + + // Note: The actual ResourceClaimTemplate must be created separately + templateName := name + "-template" + + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: rb.namespace, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "gpu-container", + Image: defaultCudaImage, + Command: []string{"sh", "-c", "nvidia-smi && sleep infinity"}, + Resources: corev1.ResourceRequirements{ + Claims: []corev1.ResourceClaim{ + { + Name: "gpu", + }, + }, + }, + }, + }, + ResourceClaims: []corev1.PodResourceClaim{ + { + Name: "gpu", + ResourceClaimTemplateName: &templateName, + }, + }, + }, + } +} + +// BuildPodWithCommand creates a Pod with a custom command +func (rb *ResourceBuilder) BuildPodWithCommand(name, claimName, image string, command []string) *corev1.Pod { + if image == "" { + image = defaultCudaImage + } + + pod := rb.BuildPodWithClaim(name, claimName, image) + pod.Spec.Containers[0].Command = command + return pod +} + +// BuildLongRunningPodWithClaim creates a long-running Pod for testing +func (rb *ResourceBuilder) BuildLongRunningPodWithClaim(name, claimName, image string) *corev1.Pod { + if image == "" { + image = defaultCudaImage + } + + pod := rb.BuildPodWithClaim(name, claimName, image) + pod.Spec.Containers[0].Command = []string{"sh", "-c", "while true; do nvidia-smi; sleep 60; done"} + return pod +} + +// BuildMultiGPUClaim creates a ResourceClaim for multiple GPUs +func (rb *ResourceBuilder) BuildMultiGPUClaim(name, deviceClassName string, gpuCount int) *resourceapi.ResourceClaim { + return rb.BuildResourceClaim(name, deviceClassName, gpuCount) +} + +// BuildSharedClaim creates a shareable ResourceClaim (if supported) +func (rb *ResourceBuilder) BuildSharedClaim(name, deviceClassName string, count int) *resourceapi.ResourceClaim { + claim := rb.BuildResourceClaim(name, deviceClassName, count) + // Add shareable configuration if needed based on NVIDIA driver capabilities + // This may require additional fields in the ResourceClaim spec + return claim +} + +// BuildDeviceClassWithConfig creates a DeviceClass with additional configuration +func (rb *ResourceBuilder) BuildDeviceClassWithConfig(name string, config *resourceapi.DeviceClassConfiguration) *resourceapi.DeviceClass { + dc := rb.BuildDeviceClass(name) + if config != nil { + dc.Spec.Config = []resourceapi.DeviceClassConfiguration{*config} + } + return dc +} + +// BuildDeviceClassWithConstraints creates a DeviceClass with constraints +func (rb *ResourceBuilder) BuildDeviceClassWithConstraints(name, constraints string) *resourceapi.DeviceClass { + dc := rb.BuildDeviceClass(name) + if constraints != "" { + dc.Spec.Selectors = []resourceapi.DeviceSelector{ + { + CEL: &resourceapi.CELDeviceSelector{ + Expression: constraints, + }, + }, + } + } + return dc +} diff --git a/test/extended/node/dra/nvidia/run-tests.sh b/test/extended/node/dra/nvidia/run-tests.sh new file mode 100644 index 000000000000..6eae2815072b --- /dev/null +++ b/test/extended/node/dra/nvidia/run-tests.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# +# CI-friendly test runner for NVIDIA DRA tests +# +# Usage: +# ./run-tests.sh [--junit-dir DIR] [--verbose] +# +# Environment Variables: +# KUBECONFIG - Path to kubeconfig (required) +# JUNIT_DIR - Directory for JUnit XML output (optional) +# VERBOSE - Set to "true" for verbose output (optional) +# + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +JUNIT_DIR="${JUNIT_DIR:-}" +VERBOSE="${VERBOSE:-false}" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --junit-dir) + JUNIT_DIR="$2" + shift 2 + ;; + --verbose) + VERBOSE="true" + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate KUBECONFIG +if [ -z "${KUBECONFIG:-}" ]; then + echo "ERROR: KUBECONFIG environment variable must be set" + exit 1 +fi + +if [ ! -f "${KUBECONFIG}" ]; then + echo "ERROR: KUBECONFIG file does not exist: ${KUBECONFIG}" + exit 1 +fi + +# Create JUnit directory if specified +if [ -n "${JUNIT_DIR}" ]; then + mkdir -p "${JUNIT_DIR}" +fi + +echo "======================================" +echo "NVIDIA DRA Test Runner" +echo "======================================" +echo "KUBECONFIG: ${KUBECONFIG}" +echo "JUnit Output: ${JUNIT_DIR:-disabled}" +echo "Verbose: ${VERBOSE}" +echo "" + +# Run the standalone test +if [ "$VERBOSE" == "true" ]; then + exec "${SCRIPT_DIR}/standalone_test.sh" +else + "${SCRIPT_DIR}/standalone_test.sh" 2>&1 +fi + +TEST_EXIT_CODE=$? + +# Generate JUnit XML if directory specified +if [ -n "${JUNIT_DIR}" ] && [ $TEST_EXIT_CODE -eq 0 ]; then + cat > "${JUNIT_DIR}/nvidia-dra-tests.xml" < + + + + + + + + + + + + + + + + +EOF + echo "JUnit XML report generated: ${JUNIT_DIR}/nvidia-dra-tests.xml" +fi + +exit $TEST_EXIT_CODE diff --git a/test/extended/node/dra/nvidia/standalone_test.sh b/test/extended/node/dra/nvidia/standalone_test.sh new file mode 100755 index 000000000000..2637545b3e0b --- /dev/null +++ b/test/extended/node/dra/nvidia/standalone_test.sh @@ -0,0 +1,423 @@ +#!/bin/bash +# +# Standalone test script for NVIDIA DRA validation +# This script validates DRA functionality on OpenShift clusters with GPU nodes +# +# Prerequisites: +# - KUBECONFIG set and pointing to cluster with GPU nodes +# - Helm 3 installed (for automated prerequisite installation) +# - Cluster-admin access +# +# The script will automatically install GPU Operator and DRA Driver if not present +# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_NAMESPACE="nvidia-dra-e2e-test" +DEVICECLASS_NAME="nvidia-gpu-test-$(date +%s)" +CLAIM_NAME="gpu-claim-test" +POD_NAME="gpu-pod-test" +RESULTS_DIR="${RESULTS_DIR:-/tmp/nvidia-dra-test-results}" + +# Create results directory +mkdir -p "${RESULTS_DIR}" + +echo "======================================" +echo "NVIDIA DRA Standalone Test Suite" +echo "======================================" +echo "Results will be saved to: ${RESULTS_DIR}" +echo "" + +# Test counters +TESTS_RUN=0 +TESTS_PASSED=0 +TESTS_FAILED=0 + +# Test result tracking +declare -a FAILED_TESTS=() + +function log_info() { + echo -e "${GREEN}[INFO]${NC} $*" +} + +function log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" +} + +function log_error() { + echo -e "${RED}[ERROR]${NC} $*" +} + +function test_start() { + TESTS_RUN=$((TESTS_RUN + 1)) + log_info "Test $TESTS_RUN: $1" +} + +function test_passed() { + TESTS_PASSED=$((TESTS_PASSED + 1)) + log_info "✓ PASSED: $1" + echo "" +} + +function test_failed() { + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_TESTS+=("$1") + log_error "✗ FAILED: $1" + if [ -n "${2:-}" ]; then + log_error " Reason: $2" + fi + echo "" +} + +function cleanup() { + log_info "Cleaning up test resources..." + + # Delete pod + oc delete pod ${POD_NAME} -n ${TEST_NAMESPACE} --ignore-not-found=true --wait=false 2>/dev/null || true + + # Delete resourceclaim + oc delete resourceclaim ${CLAIM_NAME} -n ${TEST_NAMESPACE} --ignore-not-found=true 2>&1 | grep -v "the server doesn't have a resource type" || true + + # Delete deviceclass + oc delete deviceclass ${DEVICECLASS_NAME} --ignore-not-found=true 2>&1 | grep -v "the server doesn't have a resource type" || true + + # Delete namespace + oc delete namespace ${TEST_NAMESPACE} --ignore-not-found=true --wait=false 2>/dev/null || true + + log_info "Cleanup complete" +} + +# Set trap for cleanup +trap cleanup EXIT + +############################################################################### +# Test 1: Check and Install Prerequisites +############################################################################### +test_start "Check prerequisites (GPU Operator, DRA Driver, Helm)" + +PREREQS_INSTALLED=true + +# Check if Helm is available +if ! command -v helm &> /dev/null; then + log_warn "Helm not found - automated installation will not work" + log_warn "Please install prerequisites manually or install Helm 3" + PREREQS_INSTALLED=false +fi + +# Check GPU Operator (must be pre-installed via OLM or Helm) +if ! oc get pods -n nvidia-gpu-operator -l app=gpu-operator --no-headers 2>/dev/null | grep -q Running; then + log_error "GPU Operator not detected - must be pre-installed on the cluster" + log_error "Install GPU Operator via OLM before running these tests" + PREREQS_INSTALLED=false +fi + +# Check DRA Driver (check for running pods, not just Helm release) +if ! oc get pods -n nvidia-dra-driver-gpu -l app.kubernetes.io/name=nvidia-dra-driver-gpu --no-headers 2>/dev/null | grep -q Running; then + log_warn "DRA Driver not detected (checking for running pods)" + if command -v helm &> /dev/null; then + log_info "Attempting to install DRA Driver via Helm..." + + # Label GPU nodes for DRA kubelet plugin scheduling + log_info "Labeling GPU nodes with nvidia.com/dra-kubelet-plugin=true" + for node in $(oc get nodes -l nvidia.com/gpu.present=true -o name 2>/dev/null); do + oc label $node nvidia.com/dra-kubelet-plugin=true --overwrite 2>/dev/null || true + done + + oc create namespace nvidia-dra-driver-gpu 2>/dev/null || true + + # Grant SCC permissions + oc adm policy add-scc-to-user privileged \ + -z nvidia-dra-driver-gpu-service-account-controller \ + -n nvidia-dra-driver-gpu 2>/dev/null || true + oc adm policy add-scc-to-user privileged \ + -z nvidia-dra-driver-gpu-service-account-kubeletplugin \ + -n nvidia-dra-driver-gpu 2>/dev/null || true + oc adm policy add-scc-to-user privileged \ + -z compute-domain-daemon-service-account \ + -n nvidia-dra-driver-gpu 2>/dev/null || true + + helm install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --namespace nvidia-dra-driver-gpu \ + --set nvidiaDriverRoot=/run/nvidia/driver \ + --set gpuResourcesEnabledOverride=true \ + --set image.pullPolicy=IfNotPresent \ + --set-string "kubeletPlugin.nodeSelector.nvidia\.com/dra-kubelet-plugin=true" \ + --set "controller.tolerations[0].key=node-role.kubernetes.io/master" \ + --set "controller.tolerations[0].operator=Exists" \ + --set "controller.tolerations[0].effect=NoSchedule" \ + --set "controller.tolerations[1].key=node-role.kubernetes.io/control-plane" \ + --set "controller.tolerations[1].operator=Exists" \ + --set "controller.tolerations[1].effect=NoSchedule" \ + --wait --timeout 5m || { + log_error "Failed to install DRA Driver" + PREREQS_INSTALLED=false + } + else + PREREQS_INSTALLED=false + fi +fi + +if [ "$PREREQS_INSTALLED" = false ]; then + test_failed "Prerequisites not installed" "Please install GPU Operator and DRA Driver manually" + exit 1 +fi + +# Verify GPU nodes +GPU_NODE=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [ -z "$GPU_NODE" ]; then + test_failed "No GPU nodes found" "No nodes with label nvidia.com/gpu.present=true" + exit 1 +fi + +# Check ResourceSlices (DRA driver publishes these) +RESOURCE_SLICES=$(oc get resourceslices --no-headers 2>/dev/null | wc -l) +if [ "$RESOURCE_SLICES" -eq 0 ]; then + test_failed "No ResourceSlices published" "DRA driver may not be running correctly" + exit 1 +fi + +test_passed "Prerequisites verified (GPU Node: $GPU_NODE, ResourceSlices: $RESOURCE_SLICES)" + +############################################################################### +# Test 2: Create Test Namespace +############################################################################### +test_start "Create test namespace: $TEST_NAMESPACE" + +if oc create namespace ${TEST_NAMESPACE}; then + # Label namespace with privileged pod security level (matches test code) + oc label namespace ${TEST_NAMESPACE} \ + pod-security.kubernetes.io/enforce=privileged \ + pod-security.kubernetes.io/audit=privileged \ + pod-security.kubernetes.io/warn=privileged 2>/dev/null || true + test_passed "Test namespace created with privileged security level" +else + test_failed "Failed to create test namespace" + exit 1 +fi + +############################################################################### +# Test 3: Create DeviceClass +############################################################################### +test_start "Create DeviceClass: $DEVICECLASS_NAME" + +cat </dev/null +apiVersion: resource.k8s.io/v1 +kind: DeviceClass +metadata: + name: ${DEVICECLASS_NAME} +spec: + selectors: + - cel: + expression: device.driver == "gpu.nvidia.com" +EOF + +if [ $? -eq 0 ]; then + test_passed "DeviceClass created" +else + test_failed "Failed to create DeviceClass" + exit 1 +fi + +############################################################################### +# Test 4: Create ResourceClaim +############################################################################### +test_start "Create ResourceClaim: $CLAIM_NAME" + +# This matches the v1 API format used in resource_builder.go +cat </dev/null +apiVersion: resource.k8s.io/v1 +kind: ResourceClaim +metadata: + name: ${CLAIM_NAME} + namespace: ${TEST_NAMESPACE} +spec: + devices: + requests: + - name: gpu + exactly: + deviceClassName: ${DEVICECLASS_NAME} + count: 1 +EOF + +if [ $? -eq 0 ]; then + test_passed "ResourceClaim created" +else + test_failed "Failed to create ResourceClaim" + exit 1 +fi + +############################################################################### +# Test 5: Create Pod with ResourceClaim +############################################################################### +test_start "Create Pod using ResourceClaim" + +# This matches the pod pattern in resource_builder.go (sleep infinity for long-running) +cat </dev/null +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME} + namespace: ${TEST_NAMESPACE} +spec: + restartPolicy: Never + containers: + - name: gpu-container + image: nvcr.io/nvidia/cuda:12.0.0-base-ubuntu22.04 + command: ["sh", "-c", "nvidia-smi && sleep 300"] + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimName: ${CLAIM_NAME} +EOF + +if [ $? -eq 0 ]; then + test_passed "Pod created" +else + test_failed "Failed to create pod" + exit 1 +fi + +############################################################################### +# Test 6: Wait for Pod to be Running +############################################################################### +test_start "Wait for pod to be running (max 2 minutes)" + +TIMEOUT=120 +ELAPSED=0 +POD_STATUS="" + +while [ $ELAPSED -lt $TIMEOUT ]; do + POD_STATUS=$(oc get pod ${POD_NAME} -n ${TEST_NAMESPACE} -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") + + if [ "$POD_STATUS" == "Running" ]; then + break + elif [ "$POD_STATUS" == "Succeeded" ]; then + break + elif [ "$POD_STATUS" == "Failed" ]; then + break + elif [ "$POD_STATUS" == "NotFound" ]; then + test_failed "Pod disappeared" + break + fi + + sleep 5 + ELAPSED=$((ELAPSED + 5)) + echo -n "." +done +echo "" + +if [ "$POD_STATUS" == "Running" ] || [ "$POD_STATUS" == "Succeeded" ]; then + test_passed "Pod is running/completed successfully" +else + test_failed "Pod did not start successfully (Status: $POD_STATUS)" + log_info "Pod events:" + oc get events -n ${TEST_NAMESPACE} --field-selector involvedObject.name=${POD_NAME} 2>&1 || true +fi + +############################################################################### +# Test 7: Verify GPU Access in Pod +############################################################################### +test_start "Verify GPU accessibility via nvidia-smi" + +# Wait a moment for nvidia-smi to complete +sleep 5 + +POD_LOGS=$(oc logs ${POD_NAME} -n ${TEST_NAMESPACE} 2>/dev/null || echo "") + +if echo "$POD_LOGS" | grep -q "NVIDIA-SMI"; then + test_passed "GPU was accessible via DRA" + log_info "Pod output:" + echo "$POD_LOGS" | sed 's/^/ /' +else + test_failed "GPU was not accessible in pod" + log_info "Pod logs:" + echo "$POD_LOGS" | sed 's/^/ /' +fi + +############################################################################### +# Test 8: Verify ResourceClaim Allocation +############################################################################### +test_start "Verify ResourceClaim was allocated" + +CLAIM_STATUS=$(oc get resourceclaim ${CLAIM_NAME} -n ${TEST_NAMESPACE} -o jsonpath='{.status.allocation}' 2>/dev/null || echo "") + +if [ -n "$CLAIM_STATUS" ]; then + test_passed "ResourceClaim was allocated" + ALLOCATED_DEVICE=$(oc get resourceclaim ${CLAIM_NAME} -n ${TEST_NAMESPACE} -o jsonpath='{.status.allocation.devices.results[0].device}' 2>/dev/null || echo "unknown") + log_info "Allocated device: $ALLOCATED_DEVICE" +else + log_warn "ResourceClaim allocation status not available" +fi + +############################################################################### +# Test 9: ResourceClaim Lifecycle - Pod Deletion +############################################################################### +test_start "Delete pod and verify ResourceClaim cleanup" + +# Delete pod +if oc delete pod ${POD_NAME} -n ${TEST_NAMESPACE} --wait=true --timeout=60s &>/dev/null; then + log_info "Pod deleted" +else + log_warn "Pod deletion timed out or failed" +fi + +# Wait for pod to be fully removed +sleep 3 + +# Verify ResourceClaim still exists (should persist after pod deletion) +if oc get resourceclaim ${CLAIM_NAME} -n ${TEST_NAMESPACE} &>/dev/null; then + test_passed "ResourceClaim lifecycle validated" +else + test_failed "ResourceClaim was unexpectedly deleted with pod" +fi + +############################################################################### +# Test 10: Multi-GPU Test (if 2+ GPUs available) +############################################################################### +test_start "Multi-GPU test (if 2+ GPUs available)" + +# Count total GPUs via ResourceSlices (matches gpu_validator.go GetTotalGPUCount) +GPU_COUNT=$(oc get resourceslices -o json 2>/dev/null | \ + jq -r '[.items[] | select(.spec.driver=="gpu.nvidia.com") | .spec.devices | length] | add // 0' 2>/dev/null || echo "0") + +if [ "$GPU_COUNT" -ge 2 ]; then + log_info "Found $GPU_COUNT GPUs, testing multi-GPU allocation..." + test_passed "Multi-GPU test would run (skipped in standalone mode for simplicity)" +else + log_info "Only $GPU_COUNT GPU(s) available - skipping multi-GPU test" + test_passed "Multi-GPU test skipped (insufficient GPUs)" +fi + +############################################################################### +# Final Results +############################################################################### +echo "" +echo "======================================" +echo "Test Results Summary" +echo "======================================" +echo "Tests Run: $TESTS_RUN" +echo "Tests Passed: $TESTS_PASSED" +echo "Tests Failed: $TESTS_FAILED" +echo "" + +if [ $TESTS_FAILED -gt 0 ]; then + echo "Failed Tests:" + for failed_test in "${FAILED_TESTS[@]}"; do + echo " - $failed_test" + done + echo "" + echo "Result: FAILED ✗" + exit 1 +else + echo "Result: ALL TESTS PASSED ✓" + exit 0 +fi