Commit 93efd7f

Add GPU discovery utilities for process-to-GPU mapping in hardware fault injection tests
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
1 parent fb4432e commit 93efd7f

File tree

2 files changed: +287 -0 lines changed

tests/fault_tolerance/hardware/fault-injection-service/helpers/__init__.py

Lines changed: 15 additions & 0 deletions
@@ -9,11 +9,26 @@
 """

 __all__ = [
+    # GPU discovery utilities
+    "get_available_gpu_ids",
+    "get_gpu_id_for_process",
+    "get_gpu_pci_address",
+    "get_gpu_info",
+    "get_processes_on_gpu",
+    # Inference testing utilities
     "InferenceLoadTester",
     "get_inference_endpoint",
+    # Kubernetes operations utilities
     "NodeOperations",
     "PodOperations",
 ]

+from .gpu_discovery import (
+    get_available_gpu_ids,
+    get_gpu_id_for_process,
+    get_gpu_info,
+    get_gpu_pci_address,
+    get_processes_on_gpu,
+)
 from .inference_testing import InferenceLoadTester, get_inference_endpoint
 from .k8s_operations import NodeOperations, PodOperations
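
For reference, a minimal usage sketch of the re-exported helpers (not part of this commit): it assumes `pod` is a kr8s Pod object with a working exec(), for example one located through the existing PodOperations helper, that the helpers package is importable from the test's working directory, and that the PID is illustrative.

# Sketch only, not part of this commit. Assumes `pod` is a kr8s Pod object
# (e.g. located via PodOperations) and that `helpers` is on the import path.
from helpers import get_available_gpu_ids, get_gpu_id_for_process

gpu_ids = get_available_gpu_ids(pod)       # e.g. [0, 1, 2, 3]
gpu_id = get_gpu_id_for_process(pod, 603)  # illustrative PID -> its primary GPU
print(f"Pod {pod.name}: GPUs {gpu_ids}, PID 603 on GPU {gpu_id}")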
tests/fault_tolerance/hardware/fault-injection-service/helpers/gpu_discovery.py

Lines changed: 272 additions & 0 deletions
@@ -0,0 +1,272 @@

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
GPU discovery utilities for fault tolerance testing.

Provides functions to discover GPU information from Kubernetes pods,
including mapping processes to GPUs and handling CUDA_VISIBLE_DEVICES remapping.
"""

import logging
from typing import List, Optional

logger = logging.getLogger(__name__)


def get_available_gpu_ids(pod) -> List[int]:
    """
    Get list of actual GPU IDs available in the pod.

    Handles non-sequential GPU IDs correctly (e.g., [0, 1, 3, 7] with gaps).

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)

    Returns:
        List of GPU IDs (e.g., [0, 1, 2, 3]) or empty list if no GPUs found

    Example:
        >>> gpu_ids = get_available_gpu_ids(pod)
        >>> print(gpu_ids)
        [0, 1, 2, 3]
    """
    try:
        result = pod.exec([
            "nvidia-smi",
            "--query-gpu=index",
            "--format=csv,noheader"
        ])

        # Parse GPU indices from output
        gpu_ids = []
        for line in result.stdout.decode().splitlines():
            line = line.strip()
            if line.isdigit():
                gpu_ids.append(int(line))

        if not gpu_ids:
            logger.warning(f"No GPUs found in pod {pod.name}")
            return []

        logger.debug(f"Available GPU IDs in pod {pod.name}: {gpu_ids}")
        return gpu_ids

    except Exception as e:
        logger.error(f"Failed to get GPU IDs from pod {pod.name}: {e}")
        return []


def get_gpu_id_for_process(pod, process_pid: int) -> int:
    """
    Find which GPU a process is using.

    Queries nvidia-smi to determine the primary GPU for a given process.
    This correctly handles:
    - Non-sequential GPU IDs
    - CUDA_VISIBLE_DEVICES remapping
    - Multi-GPU processes (returns primary GPU)

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)
        process_pid: Process ID to find GPU for

    Returns:
        GPU ID (0-N) where the process is running, or 0 if not found

    Example:
        >>> gpu_id = get_gpu_id_for_process(pod, 603)
        >>> print(gpu_id)
        1  # Process 603 is running on GPU 1
    """
    try:
        # Get actual GPU IDs available in pod (handles non-sequential IDs)
        gpu_ids = get_available_gpu_ids(pod)

        if not gpu_ids:
            logger.error(f"No GPUs found in pod {pod.name}!")
            return 0

        logger.debug(
            f"Searching for PID {process_pid} across {len(gpu_ids)} GPUs: {gpu_ids}"
        )

        # Check each GPU for our process
        for gpu_id in gpu_ids:
            result = pod.exec([
                "nvidia-smi", "-i", str(gpu_id),
                "--query-compute-apps=pid",
                "--format=csv,noheader"
            ])

            # Parse PIDs running on this GPU
            pids_output = result.stdout.decode().strip()

            # Handle both single PID and multiple PIDs
            # Output can be:
            #   "602" (single PID)
            #   "602\n603\n604" (multiple PIDs)
            #   " 602 " (with spaces)
            pids_on_gpu = [
                p.strip()
                for p in pids_output.split('\n')
                if p.strip()
            ]

            # Check if our PID is in the list
            if str(process_pid) in pids_on_gpu:
                logger.info(
                    f"PID {process_pid} found on GPU {gpu_id} in pod {pod.name}"
                )
                return gpu_id

        # Process not found on any GPU
        logger.warning(
            f"PID {process_pid} not found on any GPU in pod {pod.name}. "
            f"This may happen if the process hasn't initialized CUDA yet or "
            f"if nvidia-smi doesn't track multi-process CUDA apps. "
            f"Defaulting to first GPU: {gpu_ids[0]}"
        )
        return gpu_ids[0]

    except Exception as e:
        logger.error(f"GPU discovery failed for PID {process_pid} in pod {pod.name}: {e}")
        return 0


def get_gpu_pci_address(pod, gpu_id: int) -> Optional[str]:
    """
    Get PCI bus address for a GPU.

    The PCI address is used in kernel XID messages and identifies
    the physical hardware location of the GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N) as shown by nvidia-smi

    Returns:
        PCI address (e.g., "00000000:8D:00.0") or None if failed

    Example:
        >>> pci_addr = get_gpu_pci_address(pod, 1)
        >>> print(pci_addr)
        00000000:91:00.0
    """
    try:
        result = pod.exec([
            "nvidia-smi",
            "-i", str(gpu_id),
            "--query-gpu=pci.bus_id",
            "--format=csv,noheader"
        ])

        pci_addr = result.stdout.decode().strip()

        if not pci_addr:
            logger.error(f"Empty PCI address for GPU {gpu_id}")
            return None

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has PCI address: {pci_addr}")
        return pci_addr

    except Exception as e:
        logger.error(f"Failed to get PCI address for GPU {gpu_id} in pod {pod.name}: {e}")
        return None


def get_gpu_info(pod, gpu_id: int) -> Optional[dict]:
    """
    Get comprehensive information about a GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        Dict with keys: index, name, pci_bus_id, memory_total, driver_version
        or None if failed

    Example:
        >>> info = get_gpu_info(pod, 0)
        >>> print(info)
        {
            'index': 0,
            'name': 'NVIDIA H200',
            'pci_bus_id': '00000000:8D:00.0',
            'memory_total': '143771 MiB',
            'driver_version': '550.163.01'
        }
    """
    try:
        result = pod.exec([
            "nvidia-smi",
            "-i", str(gpu_id),
            "--query-gpu=index,name,pci.bus_id,memory.total,driver_version",
            "--format=csv,noheader"
        ])

        output = result.stdout.decode().strip()
        parts = [p.strip() for p in output.split(',')]

        if len(parts) < 5:
            logger.error(f"Unexpected nvidia-smi output format: {output}")
            return None

        return {
            'index': int(parts[0]),
            'name': parts[1],
            'pci_bus_id': parts[2],
            'memory_total': parts[3],
            'driver_version': parts[4]
        }

    except Exception as e:
        logger.error(f"Failed to get GPU info for GPU {gpu_id}: {e}")
        return None


def get_processes_on_gpu(pod, gpu_id: int) -> List[int]:
    """
    Get list of process IDs running on a specific GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        List of PIDs running on this GPU, or empty list if none/error

    Example:
        >>> pids = get_processes_on_gpu(pod, 1)
        >>> print(pids)
        [602, 603]
    """
    try:
        result = pod.exec([
            "nvidia-smi", "-i", str(gpu_id),
            "--query-compute-apps=pid",
            "--format=csv,noheader"
        ])

        pids_output = result.stdout.decode().strip()

        if not pids_output:
            logger.debug(f"No processes found on GPU {gpu_id} in pod {pod.name}")
            return []

        # Parse PIDs (handle multiple PIDs on same GPU)
        pids = []
        for line in pids_output.split('\n'):
            line = line.strip()
            if line.isdigit():
                pids.append(int(line))

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has processes: {pids}")
        return pids

    except Exception as e:
        logger.error(f"Failed to get processes for GPU {gpu_id}: {e}")
        return []
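
Taken together, these helpers provide the process-to-GPU mapping named in the commit message: resolve the GPU behind a target process, then look up the PCI bus ID that kernel XID messages for that GPU would carry. A hedged end-to-end sketch follows; the `describe_target_gpu` wrapper, the `pod` object, and `target_pid` are illustrative assumptions rather than code added by this commit.

# Illustrative sketch only. Assumes `pod` is a kr8s Pod and `target_pid` is a
# process already known to the test; describe_target_gpu is a hypothetical
# wrapper, not a helper added by this commit.
from helpers import (
    get_gpu_id_for_process,
    get_gpu_info,
    get_gpu_pci_address,
    get_processes_on_gpu,
)


def describe_target_gpu(pod, target_pid: int) -> dict:
    """Collect the GPU details a hardware fault injection test needs for one process."""
    gpu_id = get_gpu_id_for_process(pod, target_pid)
    return {
        "gpu_id": gpu_id,
        "pci_bus_id": get_gpu_pci_address(pod, gpu_id),  # appears in kernel XID messages
        "gpu_info": get_gpu_info(pod, gpu_id),
        "co_resident_pids": get_processes_on_gpu(pod, gpu_id),
    }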
