Skip to content

Commit 9218156

Browse files
committed
format
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
1 parent 481b5a0 commit 9218156

File tree

1 file changed

+94
-85
lines changed

1 file changed

+94
-85
lines changed

tests/fault_tolerance/hardware/fault_injection_service/helpers/gpu_discovery.py

Lines changed: 94 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -18,41 +18,37 @@
1818
def get_available_gpu_ids(pod) -> List[int]:
    """
    Get list of actual GPU IDs available in the pod.

    Runs ``nvidia-smi --query-gpu=index --format=csv,noheader`` inside the pod
    and keeps every output line that consists solely of digits, so
    non-sequential GPU IDs (e.g., [0, 1, 3, 7] with gaps) are reported as-is.

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)

    Returns:
        List of GPU IDs (e.g., [0, 1, 2, 3]), or an empty list if no GPUs
        were found or the query failed (failures are logged, never raised)

    Example:
        >>> gpu_ids = get_available_gpu_ids(pod)
        >>> print(gpu_ids)
        [0, 1, 2, 3]
    """
    try:
        result = pod.exec(["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"])

        # One GPU index per output line; non-numeric lines are ignored.
        stripped_lines = (
            line.strip() for line in result.stdout.decode().splitlines()
        )
        gpu_ids = [int(line) for line in stripped_lines if line.isdigit()]

        if not gpu_ids:
            logger.warning(f"No GPUs found in pod {pod.name}")
            return []

        logger.debug(f"Available GPU IDs in pod {pod.name}: {gpu_ids}")
        return gpu_ids

    except Exception as e:
        # Best-effort discovery: callers treat an empty list as "no GPUs".
        logger.error(f"Failed to get GPU IDs from pod {pod.name}: {e}")
        return []
@@ -61,20 +57,20 @@ def get_available_gpu_ids(pod) -> List[int]:
6157
def get_gpu_id_for_process(pod, process_pid: int) -> int:
6258
"""
6359
Find which GPU a process is using.
64-
60+
6561
Queries nvidia-smi to determine the primary GPU for a given process.
6662
This correctly handles:
6763
- Non-sequential GPU IDs
6864
- CUDA_VISIBLE_DEVICES remapping
6965
- Multi-GPU processes (returns primary GPU)
70-
66+
7167
Args:
7268
pod: Kubernetes pod object (kr8s pod with exec() method)
7369
process_pid: Process ID to find GPU for
74-
70+
7571
Returns:
7672
GPU ID (0-N) where the process is running, or 0 if not found
77-
73+
7874
Example:
7975
>>> gpu_id = get_gpu_id_for_process(pod, 603)
8076
>>> print(gpu_id)
@@ -83,44 +79,44 @@ def get_gpu_id_for_process(pod, process_pid: int) -> int:
8379
try:
8480
# Get actual GPU IDs available in pod (handles non-sequential IDs)
8581
gpu_ids = get_available_gpu_ids(pod)
86-
82+
8783
if not gpu_ids:
8884
logger.error(f"No GPUs found in pod {pod.name}!")
8985
return 0
90-
86+
9187
logger.debug(
9288
f"Searching for PID {process_pid} across {len(gpu_ids)} GPUs: {gpu_ids}"
9389
)
94-
90+
9591
# Check each GPU for our process
9692
for gpu_id in gpu_ids:
97-
result = pod.exec([
98-
"nvidia-smi", "-i", str(gpu_id),
99-
"--query-compute-apps=pid",
100-
"--format=csv,noheader"
101-
])
102-
93+
result = pod.exec(
94+
[
95+
"nvidia-smi",
96+
"-i",
97+
str(gpu_id),
98+
"--query-compute-apps=pid",
99+
"--format=csv,noheader",
100+
]
101+
)
102+
103103
# Parse PIDs running on this GPU
104104
pids_output = result.stdout.decode().strip()
105-
105+
106106
# Handle both single PID and multiple PIDs
107107
# Output can be:
108108
# "602" (single PID)
109109
# "602\n603\n604" (multiple PIDs)
110110
# " 602 " (with spaces)
111-
pids_on_gpu = [
112-
p.strip()
113-
for p in pids_output.split('\n')
114-
if p.strip()
115-
]
116-
111+
pids_on_gpu = [p.strip() for p in pids_output.split("\n") if p.strip()]
112+
117113
# Check if our PID is in the list
118114
if str(process_pid) in pids_on_gpu:
119115
logger.info(
120116
f"PID {process_pid} found on GPU {gpu_id} in pod {pod.name}"
121117
)
122118
return gpu_id
123-
119+
124120
# Process not found on any GPU
125121
logger.warning(
126122
f"PID {process_pid} not found on any GPU in pod {pod.name}. "
@@ -129,65 +125,72 @@ def get_gpu_id_for_process(pod, process_pid: int) -> int:
129125
f"Defaulting to first GPU: {gpu_ids[0]}"
130126
)
131127
return gpu_ids[0]
132-
128+
133129
except Exception as e:
134-
logger.error(f"GPU discovery failed for PID {process_pid} in pod {pod.name}: {e}")
130+
logger.error(
131+
f"GPU discovery failed for PID {process_pid} in pod {pod.name}: {e}"
132+
)
135133
return 0
136134

137135

138136
def get_gpu_pci_address(pod, gpu_id: int) -> Optional[str]:
    """
    Get PCI bus address for a GPU.

    The PCI address is used in kernel XID messages and identifies
    the physical hardware location of the GPU. It is read by running
    ``nvidia-smi -i <gpu_id> --query-gpu=pci.bus_id --format=csv,noheader``
    inside the pod.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N) as shown by nvidia-smi

    Returns:
        PCI address (e.g., "00000000:8D:00.0"), or None if nvidia-smi
        printed nothing or the query failed (failures are logged, never raised)

    Example:
        >>> pci_addr = get_gpu_pci_address(pod, 1)
        >>> print(pci_addr)
        00000000:91:00.0
    """
    try:
        result = pod.exec(
            [
                "nvidia-smi",
                "-i",
                str(gpu_id),
                "--query-gpu=pci.bus_id",
                "--format=csv,noheader",
            ]
        )

        pci_addr = result.stdout.decode().strip()

        if not pci_addr:
            logger.error(f"Empty PCI address for GPU {gpu_id}")
            return None

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has PCI address: {pci_addr}")
        return pci_addr

    except Exception as e:
        # Best-effort lookup: callers treat None as "address unavailable".
        logger.error(
            f"Failed to get PCI address for GPU {gpu_id} in pod {pod.name}: {e}"
        )
        return None
177180

178181

179182
def get_gpu_info(pod, gpu_id: int) -> Optional[dict]:
180183
"""
181184
Get comprehensive information about a GPU.
182-
185+
183186
Args:
184187
pod: Kubernetes pod object
185188
gpu_id: GPU index (0-N)
186-
189+
187190
Returns:
188191
Dict with keys: index, name, pci_bus_id, memory_total, driver_version
189192
or None if failed
190-
193+
191194
Example:
192195
>>> info = get_gpu_info(pod, 0)
193196
>>> print(info)
@@ -200,28 +203,31 @@ def get_gpu_info(pod, gpu_id: int) -> Optional[dict]:
200203
}
201204
"""
202205
try:
203-
result = pod.exec([
204-
"nvidia-smi",
205-
"-i", str(gpu_id),
206-
"--query-gpu=index,name,pci.bus_id,memory.total,driver_version",
207-
"--format=csv,noheader"
208-
])
209-
206+
result = pod.exec(
207+
[
208+
"nvidia-smi",
209+
"-i",
210+
str(gpu_id),
211+
"--query-gpu=index,name,pci.bus_id,memory.total,driver_version",
212+
"--format=csv,noheader",
213+
]
214+
)
215+
210216
output = result.stdout.decode().strip()
211-
parts = [p.strip() for p in output.split(',')]
212-
217+
parts = [p.strip() for p in output.split(",")]
218+
213219
if len(parts) < 5:
214220
logger.error(f"Unexpected nvidia-smi output format: {output}")
215221
return None
216-
222+
217223
return {
218-
'index': int(parts[0]),
219-
'name': parts[1],
220-
'pci_bus_id': parts[2],
221-
'memory_total': parts[3],
222-
'driver_version': parts[4]
224+
"index": int(parts[0]),
225+
"name": parts[1],
226+
"pci_bus_id": parts[2],
227+
"memory_total": parts[3],
228+
"driver_version": parts[4],
223229
}
224-
230+
225231
except Exception as e:
226232
logger.error(f"Failed to get GPU info for GPU {gpu_id}: {e}")
227233
return None
@@ -230,43 +236,46 @@ def get_gpu_info(pod, gpu_id: int) -> Optional[dict]:
230236
def get_processes_on_gpu(pod, gpu_id: int) -> List[int]:
    """
    Get list of process IDs running on a specific GPU.

    Runs ``nvidia-smi -i <gpu_id> --query-compute-apps=pid
    --format=csv,noheader`` inside the pod and keeps every output line that
    consists solely of digits.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        List of PIDs running on this GPU, or empty list if none/error
        (errors are logged, never raised)

    Example:
        >>> pids = get_processes_on_gpu(pod, 1)
        >>> print(pids)
        [602, 603]
    """
    try:
        result = pod.exec(
            [
                "nvidia-smi",
                "-i",
                str(gpu_id),
                "--query-compute-apps=pid",
                "--format=csv,noheader",
            ]
        )

        pids_output = result.stdout.decode().strip()

        if not pids_output:
            logger.debug(f"No processes found on GPU {gpu_id} in pod {pod.name}")
            return []

        # Parse PIDs (handle multiple PIDs on same GPU); one PID per line,
        # non-numeric lines are ignored.
        stripped_lines = (line.strip() for line in pids_output.split("\n"))
        pids = [int(line) for line in stripped_lines if line.isdigit()]

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has processes: {pids}")
        return pids

    except Exception as e:
        # Best-effort lookup: callers treat an empty list as "no processes".
        logger.error(f"Failed to get processes for GPU {gpu_id}: {e}")
        return []
272-

0 commit comments

Comments
 (0)