Skip to content

Commit b97660d

Browse files
committed
[XPU] support python memory api in XPU
1 parent 81ef31d commit b97660d

File tree

4 files changed

+333
-7
lines changed

4 files changed

+333
-7
lines changed

paddle/fluid/pybind/pybind.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2917,6 +2917,7 @@ All parameter, weight, gradient are variables in Paddle.
29172917

29182918
#ifdef PADDLE_WITH_XPU
29192919
m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
2920+
m.def("get_xpu_current_device_id", &platform::GetXPUCurrentDeviceId);
29202921
m.def("xpu_empty_cache", platform::EmptyCache);
29212922
m.def("get_xpu_device_utilization_rate",
29222923
platform::GetXPUDeviceUtilizationRate);

paddle/phi/backends/xpu/xpu_info.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ int GetXPUDeviceTotalMemory(int dev_id) {
249249
xpumlMemory_t dev_mem_info;
250250
PADDLE_ENFORCE_XPUML_SUCCESS(
251251
xpumlDeviceGetMemoryInfo(dev_handle, &dev_mem_info));
252-
return dev_mem_info.totalGlobalMemory / 1024 / 1024; // MB
252+
return dev_mem_info.totalGlobalMemory; // in bytes
253253
}
254254

255255
int GetXPUDeviceUsedMemory(int dev_id) {
@@ -264,7 +264,7 @@ int GetXPUDeviceUsedMemory(int dev_id) {
264264
xpumlMemory_t dev_mem_info;
265265
PADDLE_ENFORCE_XPUML_SUCCESS(
266266
xpumlDeviceGetMemoryInfo(dev_handle, &dev_mem_info));
267-
return dev_mem_info.usedGlobalMemory / 1024 / 1024; // MB
267+
return dev_mem_info.usedGlobalMemory; // in bytes
268268
}
269269

270270
XPUVersion get_xpu_version(int dev_id) {

python/paddle/device/xpu/__init__.py

Lines changed: 324 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,66 @@
2525

2626
_XPUPlaceLike: TypeAlias = Union[
2727
XPUPlace,
28+
str, # some str like 'xpu:0', 'xpu:1', etc.
2829
int, # some int like 0, 1, etc.
2930
]
30-
__all__ = ['synchronize', 'empty_cache']
31+
32+
__all__ = [
33+
'synchronize',
34+
'device_count',
35+
'set_debug_level',
36+
'empty_cache',
37+
'max_memory_allocated',
38+
'max_memory_reserved',
39+
'reset_max_memory_allocated',
40+
'reset_max_memory_reserved',
41+
'memory_allocated',
42+
'memory_reserved',
43+
'memory_total', # memory managed by runtime, not paddle
44+
'memory_used', # memory managed by runtime, not paddle
45+
]
46+
47+
48+
def extract_xpu_device_id(device: _XPUPlaceLike, op_name: str) -> int:
49+
'''
50+
Return the id of the given xpu device. It is just a utility that will not be exposed to users.
51+
52+
Args:
53+
device(paddle.XPUPlace or int or str): The device, the id of the device or
54+
the string name of device like 'xpu:x'.
55+
Default: None.
56+
57+
Return:
58+
int: The id of the given device. If device is None, return the id of current device.
59+
'''
60+
if device is None:
61+
return core.get_xpu_current_device_id()
62+
63+
if isinstance(device, int):
64+
device_id = device
65+
elif isinstance(device, core.XPUPlace):
66+
device_id = device.get_device_id()
67+
elif isinstance(device, str):
68+
if device.startswith('xpu:'):
69+
device_id = int(device[4:])
70+
else:
71+
raise ValueError(
72+
f"The current string {device} is not expected. Because {op_name} only support string which is like 'xpu:x'. "
73+
"Please input appropriate string again!"
74+
)
75+
else:
76+
raise ValueError(
77+
f"The device type {device} is not expected. Because {op_name} only support int, str or paddle.XPUPlace. "
78+
"Please input appropriate device again!"
79+
)
80+
81+
assert (
82+
device_id >= 0
83+
), f"The device id must be not less than 0, but got id = {device_id}."
84+
assert (
85+
device_id < device_count()
86+
), f"The device id {device_id} exceeds xpu card number {device_count()}"
87+
return device_id
3188

3289

3390
@deprecated(
@@ -135,6 +192,270 @@ def empty_cache() -> None:
135192
>>> del tensor
136193
>>> paddle.device.xpu.empty_cache()
137194
'''
138-
139-
if core.is_compiled_with_xpu():
195+
name = "paddle.device.xpu.empty_cache"
196+
if not core.is_compiled_with_xpu():
197+
raise ValueError(
198+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
199+
)
200+
else:
140201
core.xpu_empty_cache()
202+
203+
204+
def max_memory_allocated(device: _XPUPlaceLike | None = None) -> int:
205+
'''
206+
Return the peak size of xpu memory that is allocated to tensor of the given device.
207+
208+
Note:
209+
The size of XPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
210+
For instance, a float32 0-D Tensor with shape [] in XPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
211+
212+
Args:
213+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
214+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
215+
Default: None.
216+
217+
Return:
218+
int: The peak size of xpu memory that is allocated to tensor of the given device, in bytes.
219+
220+
Examples:
221+
.. code-block:: python
222+
223+
>>> # doctest: +REQUIRES(env:XPU)
224+
>>> import paddle
225+
>>> paddle.device.set_device('xpu')
226+
227+
>>> max_memory_allocated_size = paddle.device.xpu.max_memory_allocated(paddle.XPUPlace(0))
228+
>>> max_memory_allocated_size = paddle.device.xpu.max_memory_allocated(0)
229+
>>> max_memory_allocated_size = paddle.device.xpu.max_memory_allocated("xpu:0")
230+
'''
231+
name = "paddle.device.xpu.max_memory_allocated"
232+
if not core.is_compiled_with_xpu():
233+
raise ValueError(
234+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
235+
)
236+
device_id = extract_xpu_device_id(device, op_name=name)
237+
return core.device_memory_stat_peak_value("Allocated", device_id)
238+
239+
240+
def max_memory_reserved(device: _XPUPlaceLike | None = None) -> int:
241+
'''
242+
Return the peak size of XPU memory that is held by the allocator of the given device.
243+
244+
Args:
245+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
246+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
247+
Default: None.
248+
249+
Return:
250+
int: The peak size of XPU memory that is held by the allocator of the given device, in bytes.
251+
252+
Examples:
253+
.. code-block:: python
254+
255+
>>> # doctest: +REQUIRES(env:XPU)
256+
>>> import paddle
257+
>>> paddle.device.set_device('xpu')
258+
259+
>>> max_memory_reserved_size = paddle.device.xpu.max_memory_reserved(paddle.XPUPlace(0))
260+
>>> max_memory_reserved_size = paddle.device.xpu.max_memory_reserved(0)
261+
>>> max_memory_reserved_size = paddle.device.xpu.max_memory_reserved("xpu:0")
262+
'''
263+
name = "paddle.device.xpu.max_memory_reserved"
264+
if not core.is_compiled_with_xpu():
265+
raise ValueError(
266+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
267+
)
268+
device_id = extract_xpu_device_id(device, op_name=name)
269+
return core.device_memory_stat_peak_value("Reserved", device_id)
270+
271+
272+
def reset_max_memory_allocated(device: _XPUPlaceLike | None = None) -> None:
273+
'''
274+
Reset the peak size of XPU memory that is allocated to tensor of the given device.
275+
276+
Args:
277+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
278+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
279+
Default: None.
280+
281+
Examples:
282+
.. code-block:: python
283+
284+
>>> # doctest: +REQUIRES(env:XPU)
285+
>>> import paddle
286+
>>> paddle.device.set_device('xpu')
287+
288+
>>> paddle.device.xpu.reset_max_memory_allocated(paddle.XPUPlace(0))
289+
>>> paddle.device.xpu.reset_max_memory_allocated(0)
290+
>>> paddle.device.xpu.reset_max_memory_allocated("xpu:0")
291+
'''
292+
293+
name = "paddle.device.xpu.reset_max_memory_allocated"
294+
if not core.is_compiled_with_xpu():
295+
raise ValueError(
296+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
297+
)
298+
device_id = extract_xpu_device_id(device, op_name=name)
299+
core.device_memory_stat_reset_peak_value("Allocated", device_id)
300+
301+
302+
def reset_max_memory_reserved(device: _XPUPlaceLike | None = None) -> None:
303+
'''
304+
Reset the peak size of XPU memory that is held by the allocator of the given device.
305+
306+
Args:
307+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
308+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
309+
Default: None.
310+
311+
Examples:
312+
.. code-block:: python
313+
314+
>>> # doctest: +REQUIRES(env:XPU)
315+
>>> import paddle
316+
>>> paddle.device.set_device('xpu')
317+
318+
>>> paddle.device.xpu.reset_max_memory_reserved(paddle.XPUPlace(0))
319+
>>> paddle.device.xpu.reset_max_memory_reserved(0)
320+
>>> paddle.device.xpu.reset_max_memory_reserved("xpu:0")
321+
'''
322+
323+
name = "paddle.device.xpu.reset_max_memory_reserved"
324+
if not core.is_compiled_with_xpu():
325+
raise ValueError(
326+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
327+
)
328+
device_id = extract_xpu_device_id(device, op_name=name)
329+
core.device_memory_stat_reset_peak_value("Reserved", device_id)
330+
331+
332+
def memory_allocated(device: _XPUPlaceLike | None = None) -> int:
333+
'''
334+
Return the current size of xpu memory that is allocated to tensor of the given device.
335+
336+
Note:
337+
The size of XPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
338+
For instance, a float32 0-D Tensor with shape [] in XPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
339+
340+
Args:
341+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
342+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
343+
Default: None.
344+
345+
Return:
346+
int: The current size of xpu memory that is allocated to tensor of the given device, in bytes.
347+
348+
Examples:
349+
.. code-block:: python
350+
351+
>>> # doctest: +REQUIRES(env:XPU)
352+
>>> import paddle
353+
>>> paddle.device.set_device('xpu')
354+
355+
>>> memory_allocated_size = paddle.device.xpu.memory_allocated(paddle.XPUPlace(0))
356+
>>> memory_allocated_size = paddle.device.xpu.memory_allocated(0)
357+
>>> memory_allocated_size = paddle.device.xpu.memory_allocated("xpu:0")
358+
'''
359+
name = "paddle.device.xpu.memory_allocated"
360+
if not core.is_compiled_with_xpu():
361+
raise ValueError(
362+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
363+
)
364+
device_id = extract_xpu_device_id(device, op_name=name)
365+
return core.device_memory_stat_current_value("Allocated", device_id)
366+
367+
368+
def memory_reserved(device: _XPUPlaceLike | None = None) -> int:
369+
'''
370+
Return the current size of XPU memory that is held by the allocator of the given device.
371+
372+
Args:
373+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
374+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
375+
Default: None.
376+
377+
Return:
378+
int: The current size of XPU memory that is held by the allocator of the given device, in bytes.
379+
380+
Examples:
381+
.. code-block:: python
382+
383+
>>> # doctest: +REQUIRES(env:XPU)
384+
>>> import paddle
385+
>>> paddle.device.set_device('xpu')
386+
387+
>>> memory_reserved_size = paddle.device.xpu.memory_reserved(paddle.XPUPlace(0))
388+
>>> memory_reserved_size = paddle.device.xpu.memory_reserved(0)
389+
>>> memory_reserved_size = paddle.device.xpu.memory_reserved("xpu:0")
390+
'''
391+
name = "paddle.device.xpu.memory_reserved"
392+
if not core.is_compiled_with_xpu():
393+
raise ValueError(
394+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
395+
)
396+
device_id = extract_xpu_device_id(device, op_name=name)
397+
return core.device_memory_stat_current_value("Reserved", device_id)
398+
399+
400+
def memory_total(device: _XPUPlaceLike | None = None) -> int:
401+
'''
402+
Return the total size of XPU memory of the given device that is held by the XPU Runtime.
403+
404+
Args:
405+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
406+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
407+
Default: None.
408+
409+
Return:
410+
int: The total size of XPU memory of the given device that is held by the XPU Runtime, in bytes.
411+
412+
Examples:
413+
.. code-block:: python
414+
415+
>>> # doctest: +REQUIRES(env:XPU)
416+
>>> import paddle
417+
>>> paddle.device.set_device('xpu')
418+
419+
>>> memory_total_size = paddle.device.xpu.memory_total(paddle.XPUPlace(0))
420+
>>> memory_total_size = paddle.device.xpu.memory_total(0)
421+
>>> memory_total_size = paddle.device.xpu.memory_total("xpu:0")
422+
'''
423+
name = "paddle.device.xpu.memory_total"
424+
if not core.is_compiled_with_xpu():
425+
raise ValueError(
426+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
427+
)
428+
device_id = extract_xpu_device_id(device, op_name=name)
429+
return core.get_xpu_device_total_memory(device_id)
430+
431+
432+
def memory_used(device: _XPUPlaceLike | None = None) -> int:
433+
'''
434+
Return the used size of XPU memory of the given device that is held by the XPU Runtime.
435+
436+
Args:
437+
device(paddle.XPUPlace|int|str|None, optional): The device, the id of the device or
438+
the string name of device like 'xpu:x'. If device is None, the device is the current device.
439+
Default: None.
440+
441+
Return:
442+
int: The used size of XPU memory of the given device that is held by the XPU Runtime, in bytes.
443+
444+
Examples:
445+
.. code-block:: python
446+
447+
>>> # doctest: +REQUIRES(env:XPU)
448+
>>> import paddle
449+
>>> paddle.device.set_device('xpu')
450+
451+
>>> memory_used_size = paddle.device.xpu.memory_used(paddle.XPUPlace(0))
452+
>>> memory_used_size = paddle.device.xpu.memory_used(0)
453+
>>> memory_used_size = paddle.device.xpu.memory_used("xpu:0")
454+
'''
455+
name = "paddle.device.xpu.memory_used"
456+
if not core.is_compiled_with_xpu():
457+
raise ValueError(
458+
f"The API {name} is only supported in XPU PaddlePaddle. Please reinstall PaddlePaddle with XPU support to call this API."
459+
)
460+
device_id = extract_xpu_device_id(device, op_name=name)
461+
return core.get_xpu_device_used_memory(device_id)

python/paddle/distributed/launch/utils/nvsmi.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,12 @@ def query_xpu_smi(query=None, index=None, dtype=None):
157157

158158
for dev_id in range(core.get_xpu_device_count()):
159159
utilization_xpu = core.get_xpu_device_utilization_rate(dev_id)
160-
mem_total = core.get_xpu_device_total_memory(dev_id)
161-
mem_used = core.get_xpu_device_used_memory(dev_id)
160+
mem_total = (
161+
core.get_xpu_device_total_memory(dev_id) / 1024 / 1024
162+
) # in MB
163+
mem_used = (
164+
core.get_xpu_device_used_memory(dev_id) / 1024 / 1024
165+
) # in MB
162166
result = [
163167
dev_id,
164168
utilization_xpu,

0 commit comments

Comments
 (0)