def get_available_gpu_ids(pod) -> List[int]:
    """
    Get list of actual GPU IDs available in the pod.

    Handles non-sequential GPU IDs correctly (e.g., [0, 1, 3, 7] with gaps).

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)

    Returns:
        List of GPU IDs (e.g., [0, 1, 2, 3]) or empty list if no GPUs found

    Example:
        >>> gpu_ids = get_available_gpu_ids(pod)
        >>> print(gpu_ids)
        [0, 1, 2, 3]
    """
    try:
        result = pod.exec(["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"])

        # Parse GPU indices from output
        gpu_ids = []
        for line in result.stdout.decode().splitlines():
            line = line.strip()
            if line.isdigit():
                gpu_ids.append(int(line))

        if not gpu_ids:
            logger.warning(f"No GPUs found in pod {pod.name}")
            return []

        logger.debug(f"Available GPU IDs in pod {pod.name}: {gpu_ids}")
        return gpu_ids

    except Exception as e:
        logger.error(f"Failed to get GPU IDs from pod {pod.name}: {e}")
        return []


def get_gpu_id_for_process(pod, process_pid: int) -> int:
    """
    Find which GPU a process is using.

    Queries nvidia-smi to determine the primary GPU for a given process.
    This correctly handles:
    - Non-sequential GPU IDs
    - CUDA_VISIBLE_DEVICES remapping
    - Multi-GPU processes (returns primary GPU)

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)
        process_pid: Process ID to find GPU for

    Returns:
        GPU ID (0-N) where the process is running, or 0 if not found

    Example:
        >>> gpu_id = get_gpu_id_for_process(pod, 603)
        >>> print(gpu_id)
        1
    """
    try:
        # Get actual GPU IDs available in pod (handles non-sequential IDs)
        gpu_ids = get_available_gpu_ids(pod)

        if not gpu_ids:
            logger.error(f"No GPUs found in pod {pod.name}!")
            return 0

        logger.debug(
            f"Searching for PID {process_pid} across {len(gpu_ids)} GPUs: {gpu_ids}"
        )

        # Check each GPU for our process
        for gpu_id in gpu_ids:
            result = pod.exec(
                [
                    "nvidia-smi",
                    "-i",
                    str(gpu_id),
                    "--query-compute-apps=pid",
                    "--format=csv,noheader",
                ]
            )

            # Parse PIDs running on this GPU
            pids_output = result.stdout.decode().strip()

            # Handle both single PID and multiple PIDs
            # Output can be:
            # "602" (single PID)
            # "602\n603\n604" (multiple PIDs)
            # " 602 " (with spaces)
            pids_on_gpu = [p.strip() for p in pids_output.split("\n") if p.strip()]

            # Check if our PID is in the list
            if str(process_pid) in pids_on_gpu:
                logger.info(
                    f"PID {process_pid} found on GPU {gpu_id} in pod {pod.name}"
                )
                return gpu_id

        # Process not found on any GPU
        logger.warning(
            f"PID {process_pid} not found on any GPU in pod {pod.name}. "
            f"Defaulting to first GPU: {gpu_ids[0]}"
        )
        return gpu_ids[0]

    except Exception as e:
        logger.error(
            f"GPU discovery failed for PID {process_pid} in pod {pod.name}: {e}"
        )
        return 0
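

# A minimal sketch (not part of the original module) showing how the same
# per-GPU query can build a complete PID -> GPU map in one pass, which is
# cheaper than calling get_gpu_id_for_process() repeatedly when checking
# many processes. Assumes `pod` behaves like the kr8s pod objects above.
def build_pid_to_gpu_map(pod) -> dict:
    """Map every compute PID in the pod to the GPU ID it runs on."""
    pid_to_gpu = {}
    for gpu_id in get_available_gpu_ids(pod):
        for pid in get_processes_on_gpu(pod, gpu_id):
            # First hit wins, mirroring the "primary GPU" convention above.
            pid_to_gpu.setdefault(pid, gpu_id)
    return pid_to_gpu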


def get_gpu_pci_address(pod, gpu_id: int) -> Optional[str]:
    """
    Get PCI bus address for a GPU.

    The PCI address is used in kernel XID messages and identifies
    the physical hardware location of the GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N) as shown by nvidia-smi

    Returns:
        PCI address (e.g., "00000000:8D:00.0") or None if failed

    Example:
        >>> pci_addr = get_gpu_pci_address(pod, 1)
        >>> print(pci_addr)
        00000000:91:00.0
    """
    try:
        result = pod.exec(
            [
                "nvidia-smi",
                "-i",
                str(gpu_id),
                "--query-gpu=pci.bus_id",
                "--format=csv,noheader",
            ]
        )

        pci_addr = result.stdout.decode().strip()

        if not pci_addr:
            logger.error(f"Empty PCI address for GPU {gpu_id}")
            return None

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has PCI address: {pci_addr}")
        return pci_addr

    except Exception as e:
        logger.error(
            f"Failed to get PCI address for GPU {gpu_id} in pod {pod.name}: {e}"
        )
        return None
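

# A minimal sketch of correlating a GPU's PCI address with kernel XID log
# lines, which typically look like "NVRM: Xid (PCI:0000:8d:00): 79, ...".
# The log format and this helper are illustrative assumptions, not part of
# the module's original API.
def pci_address_matches_xid_line(pci_addr: str, xid_line: str) -> bool:
    """Check whether an XID log line refers to the given PCI bus address."""
    import re

    match = re.search(r"PCI:([0-9a-fA-F:]+)", xid_line)
    if not match:
        return False
    xid_pci = match.group(1).lower()
    # nvidia-smi reports e.g. "00000000:8D:00.0" while XID lines usually use
    # a shorter "0000:8d:00" form, so normalize and compare on the prefix.
    normalized = pci_addr.lower().replace("00000000:", "0000:")
    return normalized.startswith(xid_pci)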


def get_gpu_info(pod, gpu_id: int) -> Optional[dict]:
    """
    Get comprehensive information about a GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        Dict with keys: index, name, pci_bus_id, memory_total, driver_version
        or None if failed

    Example:
        >>> info = get_gpu_info(pod, 0)
        >>> print(info)
        {
            'index': 0,
            'name': ...,
            'pci_bus_id': ...,
            'memory_total': ...,
            'driver_version': ...
        }
    """
    try:
        result = pod.exec(
            [
                "nvidia-smi",
                "-i",
                str(gpu_id),
                "--query-gpu=index,name,pci.bus_id,memory.total,driver_version",
                "--format=csv,noheader",
            ]
        )

        output = result.stdout.decode().strip()
        parts = [p.strip() for p in output.split(",")]

        if len(parts) < 5:
            logger.error(f"Unexpected nvidia-smi output format: {output}")
            return None

        return {
            "index": int(parts[0]),
            "name": parts[1],
            "pci_bus_id": parts[2],
            "memory_total": parts[3],
            "driver_version": parts[4],
        }

    except Exception as e:
        logger.error(f"Failed to get GPU info for GPU {gpu_id}: {e}")
        return None


def get_processes_on_gpu(pod, gpu_id: int) -> List[int]:
    """
    Get list of process IDs running on a specific GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        List of PIDs running on this GPU, or empty list if none/error

    Example:
        >>> pids = get_processes_on_gpu(pod, 1)
        >>> print(pids)
        [602, 603]
    """
    try:
        result = pod.exec(
            [
                "nvidia-smi",
                "-i",
                str(gpu_id),
                "--query-compute-apps=pid",
                "--format=csv,noheader",
            ]
        )

        pids_output = result.stdout.decode().strip()

        if not pids_output:
            logger.debug(f"No processes found on GPU {gpu_id} in pod {pod.name}")
            return []

        # Parse PIDs (handle multiple PIDs on same GPU)
        pids = []
        for line in pids_output.split("\n"):
            line = line.strip()
            if line.isdigit():
                pids.append(int(line))

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has processes: {pids}")
        return pids

    except Exception as e:
        logger.error(f"Failed to get processes for GPU {gpu_id}: {e}")
        return []
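

# End-to-end usage sketch, assuming `pod` is a connected kr8s pod object and
# the PID comes from the caller; this wrapper is illustrative, not part of
# the original API.
def locate_process_gpu(pod, process_pid: int) -> Optional[str]:
    """Return the PCI address of the GPU that get_gpu_id_for_process()
    resolves for `process_pid` (the module defaults to the first GPU when
    the PID is not found on any GPU)."""
    gpu_id = get_gpu_id_for_process(pod, process_pid)
    return get_gpu_pci_address(pod, gpu_id)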