From 75146686d0d8ee1a561eb3ff562e300993d819e8 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Sat, 7 Dec 2024 21:34:51 +0800 Subject: [PATCH 01/21] refactor: hardware data model --- swanlab/data/run/exp.py | 64 +++++++++++-------- swanlab/data/run/helper.py | 2 +- swanlab/data/run/main.py | 25 +++++--- swanlab/data/run/metadata/__init__.py | 2 +- .../data/run/metadata/hardware/__init__.py | 13 ++-- swanlab/data/run/metadata/hardware/cpu.py | 4 +- .../data/run/metadata/hardware/gpu/nvidia.py | 4 +- swanlab/data/run/metadata/hardware/memory.py | 4 +- .../data/run/metadata/hardware/npu/ascend.py | 8 +-- .../data/run/metadata/hardware/soc/apple.py | 33 ++++++---- swanlab/data/run/metadata/hardware/type.py | 27 ++++++-- swanlab/data/run/metadata/hardware/utils.py | 25 -------- 12 files changed, 118 insertions(+), 93 deletions(-) delete mode 100644 swanlab/data/run/metadata/hardware/utils.py diff --git a/swanlab/data/run/exp.py b/swanlab/data/run/exp.py index d3369a08..bb273c12 100644 --- a/swanlab/data/run/exp.py +++ b/swanlab/data/run/exp.py @@ -36,8 +36,9 @@ def __init__(self, settings: SwanLabSharedSettings, operator: SwanLabRunOperator def __add( self, key: str, - key_name: Optional[str], - key_class: ColumnClass, + name: Optional[str], + column_class: ColumnClass, + column_config: Optional[ColumnConfig], section_type: SectionType, data: DataWrapper, step: int = None, @@ -48,19 +49,19 @@ def __add( ---------- key : str key的云端唯一标识 - data : DataWrapper - 包装后的数据,用于数据解析 - key_class : str - key的类型,CUSTOM为自定义key,SYSTEM为系统key - key_name : str + name : str key的实际名称 + column_class : str + 列类型,CUSTOM为自定义key,SYSTEM为系统key section_type : str key的组类型 + data : DataWrapper + 包装后的数据,用于数据解析 step : int, optional 步数,如果不传则默认当前步数为'已添加数据数量+1' 在log函数中已经做了处理,此处不需要考虑数值类型等情况 """ - key_index = f"{key_class}-{key}" + key_index = f"{column_class}-{key}" # 判断tag是否存在,如果不存在则创建tag key_obj: SwanLabKey = self.keys.get(key_index, None) @@ -86,7 +87,15 @@ def __add( key_obj = SwanLabKey(key, self.settings) self.keys[key_index] = key_obj # 新建图表,完成数据格式校验 - column_info = key_obj.create_column(key, key_name, key_class, section_type, data, num) + column_info = key_obj.create_column( + key, + name, + column_class, + column_config, + section_type, + data, + num, + ) self.warn_type_error(key_index, key) # 创建新列,生成回调 self.__operator.on_column_create(column_info) @@ -104,11 +113,11 @@ def add( self, data: DataWrapper, key: str, - key_name: str = None, - key_class: ColumnClass = 'CUSTOM', + name: str = None, + column_class: ColumnClass = 'CUSTOM', + column_config: Optional[ColumnConfig] = None, section_type: SectionType = "PUBLIC", step: int = None, - column_config: Optional[ColumnConfig] = None, ) -> MetricInfo: """记录一条新的key数据 Parameters @@ -116,20 +125,20 @@ def add( data : DataWrapper 包装后的数据,用于数据解析 key : str - key的云端唯一标识 - key_name : str - key的实际名称, 默认与key相同 - key_class : str, optional - key的类型 + 列的云端唯一标识 + name : str + 列的实际名称, 默认与key相同 + column_class : str, optional + 列的类型 + column_config : Optional[ColumnConfig], optional + 列的额外配置信息 section_type : str, optional key的组类型 step : int, optional 步数,如果不传则默认当前步数为'已添加数据数量+1' 在log函数中已经做了处理,此处不需要考虑数值类型等情况 - column_config : Optional[ColumnConfig], optional - 列的额外配置信息 """ - m = self.__add(key, key_name, key_class, section_type, data, step) + m = self.__add(key, name, column_class, column_config, section_type, data, step) self.__operator.on_metric_create(m) return m @@ -291,8 +300,9 @@ def add(self, data: DataWrapper) -> MetricInfo: def create_column( self, key: str, - key_name: Optional[str], - key_class: ColumnClass, + name: Optional[str], + column_class: ColumnClass, + column_config: Optional[ColumnConfig], section_type: SectionType, data: DataWrapper, num: int, @@ -300,8 +310,9 @@ def create_column( """ 创建列信息,对当前key的基本信息做一个记录 :param key: str, key名称 - :param key_name: str, key的实际名称 - :param key_class: str, key的类型,CUSTOM为自定义key,SYSTEM为系统key + :param name: str, key的实际名称 + :param column_class: str, key的类型 + :param column_config: ColumnConfig, key的配置 :param section_type: str, key的组类型 :param data: DataType, 数据 :param num: 创建此列之前的列数量 @@ -320,8 +331,9 @@ def create_column( column_info = ColumnInfo( key=key, kid=str(num), - name=key_name, - cls=key_class, + name=name, + cls=column_class, + config=column_config, chart_type=result.chart, section_name=result.section, section_type=section_type, diff --git a/swanlab/data/run/helper.py b/swanlab/data/run/helper.py index 12dacb3d..74031a48 100644 --- a/swanlab/data/run/helper.py +++ b/swanlab/data/run/helper.py @@ -126,7 +126,7 @@ class MonitorCron: 用于定时采集系统信息 """ - SLEEP_TIME = 30 + SLEEP_TIME = 1 def __init__(self, monitor_func: Callable): def _(): diff --git a/swanlab/data/run/main.py b/swanlab/data/run/main.py index f21677a1..d31a16df 100644 --- a/swanlab/data/run/main.py +++ b/swanlab/data/run/main.py @@ -151,15 +151,22 @@ def monitor_func(): if monitor_info is None: swanlog.debug("Hardware info is empty. Skip it.") continue - key, name, value = monitor_info['key'], monitor_info['name'], monitor_info['value'] - v = DataWrapper(key, [Line(value)]) - self.__exp.add( - data=v, - key=key, - key_name=name, - key_class="SYSTEM", - section_type="SYSTEM", - ) + for info in monitor_info: + key, name, value, cfg = ( + info['key'], + info['name'], + info['value'], + info['config'], + ) + v = DataWrapper(key, [Line(value)]) + self.__exp.add( + data=v, + key=key, + name=name, + column_config=cfg, + # column_class="SYSTEM", + # section_type="SYSTEM", + ) return monitor_func diff --git a/swanlab/data/run/metadata/__init__.py b/swanlab/data/run/metadata/__init__.py index 31465203..97fb8b0b 100644 --- a/swanlab/data/run/metadata/__init__.py +++ b/swanlab/data/run/metadata/__init__.py @@ -14,7 +14,7 @@ from swanlab.package import get_package_version -def get_metadata(logdir: str) -> Tuple[dict, List[HardwareMonitorFunc]]: +def get_metadata(logdir: str) -> Tuple[dict, List[HardwareCollector]]: """ 采集实验的全部信息 """ diff --git a/swanlab/data/run/metadata/hardware/__init__.py b/swanlab/data/run/metadata/hardware/__init__.py index b1ebc317..18f2fb09 100644 --- a/swanlab/data/run/metadata/hardware/__init__.py +++ b/swanlab/data/run/metadata/hardware/__init__.py @@ -5,19 +5,19 @@ @description: 硬件信息采集 """ -from typing import Callable, List, Any, Optional +from typing import Callable, List, Any, Optional, Tuple from .cpu import get_cpu_info from .gpu.nvidia import get_nvidia_gpu_info from .memory import get_memory_size from .npu.ascend import get_ascend_npu_info from .soc.apple import get_apple_chip_info -from .type import HardwareFuncResult, HardwareMonitorFunc, HardwareInfo +from .type import HardwareFuncResult, HardwareCollector, HardwareInfo -__all__ = ["get_hardware_info", "HardwareMonitorFunc", "HardwareInfo"] +__all__ = ["get_hardware_info", "HardwareCollector", "HardwareInfo"] -def get_hardware_info() -> HardwareFuncResult: +def get_hardware_info() -> Tuple[Optional[Any], List[HardwareCollector]]: """ 采集硬件信息,包括CPU、GPU、内存、硬盘等 """ @@ -46,13 +46,14 @@ def get_hardware_info() -> HardwareFuncResult: def dec_hardware_func( func: Callable[[], HardwareFuncResult], - monitor_funcs: List[HardwareMonitorFunc], + monitor_funcs: List[HardwareCollector], ) -> Optional[Any]: """ 装饰器,用于记录硬件信息采集函数 """ x, y = func() - monitor_funcs.extend(y) + if y: + monitor_funcs.append(y) return x diff --git a/swanlab/data/run/metadata/hardware/cpu.py b/swanlab/data/run/metadata/hardware/cpu.py index e7da4dd3..6bd8aa4b 100644 --- a/swanlab/data/run/metadata/hardware/cpu.py +++ b/swanlab/data/run/metadata/hardware/cpu.py @@ -24,14 +24,14 @@ def get_cpu_info() -> HardwareFuncResult: else: # 其他情况,暂时不支持 # 苹果芯片单独处理 - return None, [] + return None, None try: # 获取 CPU 核心数 info["cores"] = multiprocessing.cpu_count() except Exception: # noqa pass - return info, [] + return info, None def get_cpu_brand_windows(): diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index 50233f7e..e814056d 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -30,7 +30,7 @@ def get_cuda_version(): try: pynvml.nvmlInit() except Exception: # noqa - return None, [] + return None, None try: # 获取 NVIDIA 驱动版本信息 @@ -60,4 +60,4 @@ def get_cuda_version(): finally: # 结束 NVML pynvml.nvmlShutdown() - return info, [] + return info, None diff --git a/swanlab/data/run/metadata/hardware/memory.py b/swanlab/data/run/metadata/hardware/memory.py index d6a1566c..adbef860 100644 --- a/swanlab/data/run/metadata/hardware/memory.py +++ b/swanlab/data/run/metadata/hardware/memory.py @@ -16,6 +16,6 @@ def get_memory_size() -> HardwareFuncResult: # 获取系统总内存大小 mem = psutil.virtual_memory() total_memory = round(mem.total / (1024**3)) # 单位为GB - return total_memory, [] + return total_memory, None except Exception: # noqa - return None, [] + return None, None diff --git a/swanlab/data/run/metadata/hardware/npu/ascend.py b/swanlab/data/run/metadata/hardware/npu/ascend.py index 36ddd92f..9120d3ce 100644 --- a/swanlab/data/run/metadata/hardware/npu/ascend.py +++ b/swanlab/data/run/metadata/hardware/npu/ascend.py @@ -19,11 +19,11 @@ def get_ascend_npu_info() -> HardwareFuncResult: """ # ascend芯片只支持Linux系统 if platform.system() != "Linux": - return None, [] + return None, None # /dev目录下没有davinci*设备文件,跳过 # 其实理论上davinci后接数字,代表此设备id,但是官方文档也没明确写,以防万一还是不这么干了 if not list(filter(lambda x: x.startswith("davinci"), os.listdir("/dev"))): - return None, [] + return None, None info = {"driver": None, "npu": None} try: # 获取NPU驱动版本 @@ -41,8 +41,8 @@ def get_ascend_npu_info() -> HardwareFuncResult: info["npu"][npu_id][chip_id] = {**chip_info, "usage": usage} except Exception: # noqa if all(v is None for v in info.values()): - return None, [] - return info, [] + return None, None + return info, None def get_version() -> str: diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index cc91d286..193c04d3 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -9,16 +9,17 @@ import multiprocessing import platform import subprocess +from typing import Optional, List import psutil +from swankit.callback.models import ColumnConfig -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareInfo -from swanlab.data.run.metadata.hardware.utils import hardware +from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareInfo, HardwareCollector def get_apple_chip_info() -> HardwareFuncResult: if "mac" not in platform.platform().lower(): - return None, [] + return None, None info = {"cpu": None, "gpu": None, "memory": None, "type": None} # 使用system_profiler命令以JSON格式获取GPU信息 @@ -31,17 +32,27 @@ def get_apple_chip_info() -> HardwareFuncResult: info["type"] = gpu_name info["memory"] = memory except Exception: # noqa - return None, [] + return None, None try: info["cpu"] = multiprocessing.cpu_count() except Exception: # noqa pass - return info, [ - # get_cpu_usage - ] + return info, AppleChipCollector() -@hardware -def get_cpu_usage() -> HardwareInfo: - usage = psutil.cpu_percent(interval=1) - return {"key": "cpu_usage", "value": usage, "name": "System CPU Utilization (%)"} +class AppleChipCollector(HardwareCollector): + def __init__(self): + self.cpu_config: ColumnConfig = {"y_range": (0, 100)} + + def collect(self) -> List[Optional[HardwareInfo]]: + info = [self.get_cpu_usage()] + return info + + def get_cpu_usage(self) -> HardwareInfo: + value = psutil.cpu_percent(interval=1) + return { + "key": "apple_cpu_usage", + "value": value, + "name": "System CPU Utilization (%)", + "config": self.cpu_config, + } diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index b9cfc85b..c4b21eff 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -5,7 +5,12 @@ @description: 硬件信息采集类型定义 """ -from typing import TypedDict, Callable, Tuple, Optional, Any, List, Union +from abc import ABC, abstractmethod +from typing import TypedDict, Tuple, Optional, Any, List, Union + +from swankit.callback.models import ColumnConfig + +from swanlab.log import swanlog # 定义硬件信息类型 @@ -16,10 +21,24 @@ class HardwareInfo(TypedDict): value: Union[str, int, float] # 硬件信息名称 name: str + # 相关配置 + config: Optional[ColumnConfig] + + +class HardwareCollector(ABC): + @abstractmethod + def collect(self) -> List[HardwareInfo]: + pass + def __call__(self): + try: + return self.collect() + except NotImplementedError as n: + raise n + except Exception as e: + swanlog.error("Hardware info collection failed: %s, %s", self.__class__.__name__, str(e)) + return None -# 定义硬件信息采集函数类型 -HardwareMonitorFunc = Callable[[], Optional[HardwareInfo]] # 定义硬件信息执行函数的返回结果 -HardwareFuncResult = Tuple[Optional[Any], List[HardwareMonitorFunc]] +HardwareFuncResult = Tuple[Optional[Any], Optional[HardwareCollector]] diff --git a/swanlab/data/run/metadata/hardware/utils.py b/swanlab/data/run/metadata/hardware/utils.py deleted file mode 100644 index f66e89f2..00000000 --- a/swanlab/data/run/metadata/hardware/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -@author: cunyue -@file: utils.py -@time: 2024/12/5 13:09 -@description: 硬件信息采集工具 -""" - -from swanlab.log import swanlog -from .type import HardwareMonitorFunc - - -def hardware(func: HardwareMonitorFunc) -> HardwareMonitorFunc: - """ - 硬件信息采集函数装饰器 - 如果函数执行失败返回None - """ - - def wrapper(): - try: - return func() - except Exception as e: - swanlog.error("Hardware info collection failed: %s, %s", func.__name__, str(e)) - return None - - return wrapper From a64c8747a49efafee3b0dab932ee573b1609bdbe Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:30:59 +0800 Subject: [PATCH 02/21] doc: system metric --- ...41\346\201\257\351\207\207\351\233\206.md" | 86 ++++++++++++++++++- requirements.txt | 1 - 2 files changed, 82 insertions(+), 5 deletions(-) diff --git "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" index 3d98208e..e7fc6d20 100644 --- "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" +++ "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" @@ -59,10 +59,88 @@ class MonitorCron: 简单来讲,所有采集器需要返回两部分内容,一部分是硬件整体信息,一部分是对应的硬件信息采集函数列表,前者在不同的采集器中不一样,但是后者需要遵循相同的规范: 1. 如果没什么好采集的,返回空列表 -2. - -如果有需要采集的,列表内的函数签名应该一致——此函数不接受任何参数,返回一个字典,字典类型为[HardwareInfo](/swanlab/data/run/metadata/hardware/type.py) +2. 如果有需要采集的,采集函数应该来自同一个基类,`__call__`魔术方法签名相同 > HardwareInfo字典类型遵循swanlab创建column的协议,包含列名称、图表配置、组配置等信息 -TODO \ No newline at end of file +目前swanlab的硬件信息采集主要依赖于`psutil`库。 + +### CPU + +注意:对于Apple系列的芯片的CPU信息,并不在此处采集和记录。 + +#### CPU Utilization (%) + +代表当前cpu的平均利用率。swanlab为它打了一个 `cpu` 标签。 + +#### CPU Utilization (per core) (%) + +代表当前cpu每个核心的利用率。swanlab为它打了一个 `cpu.{cpu_index}` 标签,其中cpu_index代表cpu的核心编号。 +所有核心的利用率将自动在一个图表中展示。 + +#### Process CPU Threads + +代表当前进程的CPU线程数。swanlab为它打了一个 `cpu.thds` 标签。 + +### Memory + +注意:对于Apple系列的芯片的内存信息,并不在此处采集和记录。 + +#### System Memory Utilization (%) + +代表当前系统的内存利用率。swanlab为它打了一个 `mem` 标签。 + +#### Process Memory In Use (non-swap) (MB) + +代表当前进程的内存利用率。swanlab为它打了一个 `mem.proc` 标签。 + +#### Process Memory In Use (non-swap) (%) + +代表当前进程的内存利用率。swanlab为它打了一个 `mem.proc.pct` 标签。 + +#### Process Memory Available (non-swap) (MB) + +代表当前进程的可用内存。swanlab为它打了一个 `mem.proc.avail` 标签。 + +### Apple SoC + +由于Apple SoC可能需要额外适配,因此这部分的cpu、内存信息以及(未来会加上的)GPU信息需要单独采集。注意,当前swanlab只针对M系列芯片做了硬件信息采集适配,早期intel芯片暂无额外调试,可能会存在问题。 +就目前而言,Apple的cpu信息、内存信息与上述的[CPU](#cpu)、[Memory](#memory)信息相同,标签也相同(因为同出自`psutil`库)。 + +### Nvidia GPU + +如果pynvml库可以识别到Nvidia GPU,swanlab还会采集Nvidia GPU的对应指标,他们的标签类似`gpu.{gpu_index}...`。 +同指标不同编号的GPU将自动在一个指标图表中展示。 + +#### GPU Memory Utilization (%) + +表示每个GPU的显存利用率百分比,swanlab为它打了一个 `gpu.{gpu_index}.mem.ptc` 标签。 + +#### GPU Temperature + +表示每个GPU的摄氏温度,swanlab为它打了一个 `gpu.{gpu_index}.temp` 标签。 + +#### GPU Power Usage Watts + +表示每个GPU的功耗,swanlab为它打了一个 `gpu.{gpu_index}.power` 标签。 + +### Ascend NPU + +如果swanlab识别到Ascend NPU,swanlab会采集Ascend NPU的对应指标,他们的标签类似`npu.{npu_index}...`。 +同指标不同编号的NPU将自动在一个指标图表中展示。 + +#### HBM Usage Rate (%) + +表示每个NPU的HBM利用率百分比,swanlab为它打了一个 `npu.{npu_index}.hbm.ptc` 标签。 + +#### NPU Temperature + +表示每个NPU的摄氏温度,swanlab为它打了一个 `npu.{npu_index}.temp` 标签。 + +## TODO + +在信息采集部分,未来还会上线: + +1. 更详细的GPU、NPU信息(利用率、时钟信息等) +2. 更多的硬件信息采集器(如硬盘、网络等) +3. 更多的计算设备支持(如AMD GPU、Google TPU等) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ca34334f..fcb39899 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,5 @@ requests>=2.25.0 click pyyaml psutil>=5.0.0 -gputil==1.4.0 pynvml rich From 0b3aec28a0f5c4f7eb6b76486f20ecf47ecbca81 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:01:25 +0800 Subject: [PATCH 03/21] feat: collect cpu and memory info --- ...41\346\201\257\351\207\207\351\233\206.md" | 2 +- requirements.txt | 2 +- swanlab/data/run/metadata/hardware/cpu.py | 14 +- swanlab/data/run/metadata/hardware/memory.py | 20 ++- .../data/run/metadata/hardware/soc/apple.py | 32 ++--- swanlab/data/run/metadata/hardware/type.py | 9 +- swanlab/data/run/metadata/hardware/utils.py | 129 ++++++++++++++++++ .../data/run/metadata/hardware/test_type.py | 25 ++++ .../data/run/metadata/hardware/test_utils.py | 41 ++++++ test/unit/data/run/metadata/test_utils.py | 27 ---- 10 files changed, 246 insertions(+), 55 deletions(-) create mode 100644 swanlab/data/run/metadata/hardware/utils.py create mode 100644 test/unit/data/run/metadata/hardware/test_type.py create mode 100644 test/unit/data/run/metadata/hardware/test_utils.py delete mode 100644 test/unit/data/run/metadata/test_utils.py diff --git "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" index e7fc6d20..f44ac576 100644 --- "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" +++ "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" @@ -71,7 +71,7 @@ class MonitorCron: #### CPU Utilization (%) -代表当前cpu的平均利用率。swanlab为它打了一个 `cpu` 标签。 +代表当前cpu的平均利用率。swanlab为它打了一个 `cpu.pct` 标签。 #### CPU Utilization (per core) (%) diff --git a/requirements.txt b/requirements.txt index fcb39899..54756343 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -swankit==0.1.2b2 +swankit==0.1.2b5 swanboard==0.1.7b1 cos-python-sdk-v5 urllib3>=1.26.0 diff --git a/swanlab/data/run/metadata/hardware/cpu.py b/swanlab/data/run/metadata/hardware/cpu.py index 6bd8aa4b..7fd9e1eb 100644 --- a/swanlab/data/run/metadata/hardware/cpu.py +++ b/swanlab/data/run/metadata/hardware/cpu.py @@ -9,7 +9,10 @@ import platform import subprocess -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult +import psutil + +from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareCollector, HardwareInfoList +from .utils import CpuCollector as C def get_cpu_info() -> HardwareFuncResult: @@ -55,3 +58,12 @@ def get_cpu_brand_linux(): return None except Exception: # noqa return None + + +class CpuCollector(HardwareCollector, C): + def __init__(self): + super().__init__() + self.current_process = psutil.Process() + + def collect(self) -> HardwareInfoList: + return [self.get_cpu_usage(), *self.get_per_cpu_usage(), self.get_cur_proc_thds_num(self.current_process)] diff --git a/swanlab/data/run/metadata/hardware/memory.py b/swanlab/data/run/metadata/hardware/memory.py index adbef860..ae03830d 100644 --- a/swanlab/data/run/metadata/hardware/memory.py +++ b/swanlab/data/run/metadata/hardware/memory.py @@ -5,17 +5,29 @@ @description: 内存信息采集 """ +from typing import List + import psutil -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult +from .type import HardwareFuncResult, HardwareCollector, HardwareInfo +from .utils import MemoryCollector as M def get_memory_size() -> HardwareFuncResult: """获取内存大小""" try: # 获取系统总内存大小 - mem = psutil.virtual_memory() - total_memory = round(mem.total / (1024**3)) # 单位为GB - return total_memory, None + total = psutil.virtual_memory().total + total_memory = round(total / (1024**3)) # 单位为GB + return total_memory, MemoryCollector() except Exception: # noqa return None, None + + +class MemoryCollector(HardwareCollector, M): + def __init__(self): + super().__init__() + self.current_process = psutil.Process() + + def collect(self) -> List[HardwareInfo]: + return [self.get_mem_usage(), *self.get_cur_proc_mem(self.current_process)] diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index 193c04d3..ee31b753 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -9,12 +9,11 @@ import multiprocessing import platform import subprocess -from typing import Optional, List import psutil -from swankit.callback.models import ColumnConfig -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareInfo, HardwareCollector +from ..type import HardwareFuncResult, HardwareCollector, HardwareInfoList +from ..utils import CpuCollector as C, MemoryCollector as M def get_apple_chip_info() -> HardwareFuncResult: @@ -40,19 +39,16 @@ def get_apple_chip_info() -> HardwareFuncResult: return info, AppleChipCollector() -class AppleChipCollector(HardwareCollector): +class AppleChipCollector(HardwareCollector, C, M): def __init__(self): - self.cpu_config: ColumnConfig = {"y_range": (0, 100)} - - def collect(self) -> List[Optional[HardwareInfo]]: - info = [self.get_cpu_usage()] - return info - - def get_cpu_usage(self) -> HardwareInfo: - value = psutil.cpu_percent(interval=1) - return { - "key": "apple_cpu_usage", - "value": value, - "name": "System CPU Utilization (%)", - "config": self.cpu_config, - } + super().__init__() + self.current_process = psutil.Process() + + def collect(self) -> HardwareInfoList: + return [ + self.get_cpu_usage(), + *self.get_per_cpu_usage(), + self.get_cur_proc_thds_num(self.current_process), + self.get_mem_usage(), + *self.get_cur_proc_mem(self.current_process), + ] diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index c4b21eff..11c027ad 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -25,18 +25,21 @@ class HardwareInfo(TypedDict): config: Optional[ColumnConfig] +HardwareInfoList = List[HardwareInfo] + + class HardwareCollector(ABC): @abstractmethod - def collect(self) -> List[HardwareInfo]: + def collect(self) -> HardwareInfoList: pass - def __call__(self): + def __call__(self) -> Optional[HardwareInfoList]: try: return self.collect() except NotImplementedError as n: raise n except Exception as e: - swanlog.error("Hardware info collection failed: %s, %s", self.__class__.__name__, str(e)) + swanlog.error(f"Hardware info collection failed: {self.__class__.__name__}, {str(e)}") return None diff --git a/swanlab/data/run/metadata/hardware/utils.py b/swanlab/data/run/metadata/hardware/utils.py new file mode 100644 index 00000000..189d6650 --- /dev/null +++ b/swanlab/data/run/metadata/hardware/utils.py @@ -0,0 +1,129 @@ +""" +@author: cunyue +@file: utils.py +@time: 2024/12/9 16:33 +@description: 硬件信息采集工具函数 +""" + +import random + +import psutil +from swankit.callback.models import ColumnConfig + +from .type import HardwareInfo, HardwareInfoList + +ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789" + + +def random_index(): + """ + 随机生成八位字符串,用于标识图表的index + """ + return "".join(random.choices(ALPHABET, k=8)) + + +class CpuCollector: + """ + cpu采集基类,为子类赋予cpu采集的能力 + """ + + CPU_CONFIG = ColumnConfig(y_range=(0, 100), chart_name="CPU Utilization (%)") + PER_CPU_CONFIG = ColumnConfig(y_range=(0, 100), chart_name="CPU Utilization (per core) (%)") + THDS_CONFIG = ColumnConfig(y_range=(0, None), chart_name="Process CPU Threads") + + def __init__(self): + self.per_cpu_configs = [] + # 随机生成一个index,用于标识图表 + self.per_cpu_usage_index = random_index() + + def get_cpu_usage(self) -> HardwareInfo: + """ + 获取当前 CPU 使用率 + """ + return { + "key": "cpu.pct", + "name": "CPU Utilization (%)", + "value": psutil.cpu_percent(interval=1), + "config": self.CPU_CONFIG, + } + + def get_per_cpu_usage(self) -> HardwareInfoList: + """ + 获取每个 CPU 核心的使用率 + """ + per_cpu_usage = psutil.cpu_percent(interval=1, percpu=True) + result: HardwareInfoList = [] + # 避免每次调用都创建新的配置 + if len(self.per_cpu_configs) != len(per_cpu_usage): + self.per_cpu_configs = [ + self.PER_CPU_CONFIG.clone(metric_name=f"CPU {idx}", chart_index=self.per_cpu_usage_index) + for idx in range(len(per_cpu_usage)) + ] + for idx, value in enumerate(per_cpu_usage): + info: HardwareInfo = { + "key": f"cpu.{idx}.pct", + "name": f"CPU {idx} Utilization (%)", + "value": value, + "config": self.per_cpu_configs[idx], + } + result.append(info) + return result + + def get_cur_proc_thds_num(self, proc: psutil.Process) -> HardwareInfo: + """ + 获取当前进程的线程数 + """ + return { + "key": "cpu.thds", + "name": "Process CPU Threads", + "value": proc.num_threads(), + "config": self.THDS_CONFIG, + } + + +class MemoryCollector: + """ + 内存采集基类,为子类赋予内存采集的能力 + """ + + MB = 1024 * 1024 + MEM_CONFIG = ColumnConfig(y_range=(0, 100)) + PROC_MEM_PCT_CONFIG = ColumnConfig(y_range=(0, 100)) + + def get_mem_usage(self) -> HardwareInfo: + """ + 获取当前系统内存使用率 + """ + return { + "key": "mem.pct", + "name": "System Memory Utilization (%)", + "value": psutil.virtual_memory().percent, + "config": self.MEM_CONFIG, + } + + def get_cur_proc_mem(self, proc: psutil.Process) -> HardwareInfoList: + """ + 获取当前进程的内存使用情况 + """ + mem_info = proc.memory_info() + virtual_memory = psutil.virtual_memory() + mem_proc: HardwareInfo = { + "key": "mem.proc", + "name": "Process Memory In Use (non-swap) (MB)", + "value": mem_info.rss / self.MB, + "config": None, + } + mem_proc_pct: HardwareInfo = { + "key": "mem.proc.pct", + "name": "Process Memory Utilization (%)", + "value": proc.memory_percent(), + "config": self.PROC_MEM_PCT_CONFIG, + } + mem_proc_avail: HardwareInfo = { + "key": "mem.proc.avail", + "name": "Process Memory Available (MB)", + "value": virtual_memory.available / self.MB, + "config": None, + } + + return [mem_proc, mem_proc_pct, mem_proc_avail] diff --git a/test/unit/data/run/metadata/hardware/test_type.py b/test/unit/data/run/metadata/hardware/test_type.py new file mode 100644 index 00000000..9c9697a5 --- /dev/null +++ b/test/unit/data/run/metadata/hardware/test_type.py @@ -0,0 +1,25 @@ +""" +@author: cunyue +@file: test_utils.py +@time: 2024/12/5 13:27 +@description: 硬件信息采集工具测试 +""" + +from swanlab.data.run.metadata.hardware.type import HardwareCollector +from swanlab.data.run.metadata.hardware.type import HardwareInfo + + +def test_hardware(): + class TestCollector(HardwareCollector): + def collect(self) -> HardwareInfo: + return {"key": "test", "value": 1, "name": "test", "config": None} + + t = TestCollector() + assert t.collect() == {"key": "test", "value": 1, "name": "test", "config": None} + + class TestErrorCollector(HardwareCollector): + def collect(self) -> HardwareInfo: + raise Exception("test") + + t = TestErrorCollector() + assert t() is None diff --git a/test/unit/data/run/metadata/hardware/test_utils.py b/test/unit/data/run/metadata/hardware/test_utils.py new file mode 100644 index 00000000..d76973ba --- /dev/null +++ b/test/unit/data/run/metadata/hardware/test_utils.py @@ -0,0 +1,41 @@ +""" +@author: cunyue +@file: test_utils.py +@time: 2024/12/9 20:55 +@description: 测试硬件信息采集工具 +""" + +from swanlab.data.run.metadata.hardware.utils import random_index, CpuCollector + + +def test_random_index(): + s = random_index() + assert len(s) == 8 + assert s.isalnum() + + +def test_cpu_usage(): + c = CpuCollector() + usage = c.get_cpu_usage() + assert usage is not None + assert 0 <= usage["value"] <= 100 + assert usage["key"] == "cpu.pct" + assert usage["name"] == "CPU Utilization (%)" + assert usage["config"].y_range == (0, 100) + assert usage["config"].chart_name == "CPU Utilization (%)" + assert usage["config"].chart_index is None + + +def test_per_cpu_usage(): + c = CpuCollector() + usage = c.get_per_cpu_usage() + assert usage is not None + for idx, u in enumerate(usage): + assert 0 <= u["value"] <= 100 + assert u["key"] == f"cpu.{idx}.pct" + assert u["name"] == f"CPU {idx} Utilization (%)" + assert u["config"].y_range == (0, 100) + assert u["config"].chart_name == f"CPU Utilization (per core) (%)" + # 每个核心的index应该相同,因为必须要放在同一个图表中 + assert u["config"].chart_index == c.per_cpu_usage_index + assert u["config"].metric_name == f"CPU {idx}" diff --git a/test/unit/data/run/metadata/test_utils.py b/test/unit/data/run/metadata/test_utils.py deleted file mode 100644 index 55c3c160..00000000 --- a/test/unit/data/run/metadata/test_utils.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -@author: cunyue -@file: test_utils.py -@time: 2024/12/5 13:27 -@description: 硬件信息采集工具测试 -""" - -from swanlab.data.run.metadata.hardware.type import HardwareInfo -from swanlab.data.run.metadata.hardware.utils import hardware - - -def test_hardware(): - - @hardware - def func() -> HardwareInfo: - return {"value": 1, "name": "test", "key": "12345"} - - assert func() == {"key": "12345", "value": 1, "name": "test"} - - -def test_hardware_err(): - - @hardware - def func() -> HardwareInfo: - raise Exception("test error") - - assert func() is None From 7c09c4cc55d4af0d1945506b2a5f626347b93b95 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:15:48 +0800 Subject: [PATCH 04/21] feat: random key prefix --- swanlab/data/run/metadata/hardware/utils.py | 38 +++++++++++++------ .../data/run/metadata/hardware/test_utils.py | 14 ++++++- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/swanlab/data/run/metadata/hardware/utils.py b/swanlab/data/run/metadata/hardware/utils.py index 189d6650..89817013 100644 --- a/swanlab/data/run/metadata/hardware/utils.py +++ b/swanlab/data/run/metadata/hardware/utils.py @@ -15,11 +15,18 @@ ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789" -def random_index(): +def random_index(length: int = 8) -> str: """ 随机生成八位字符串,用于标识图表的index """ - return "".join(random.choices(ALPHABET, k=8)) + return "".join(random.choices(ALPHABET, k=length)) + + +def generate_key(suffix: str, length: int = 4) -> str: + """ + 生成key,用于标识系统列,避免与用户输入的key冲突 + """ + return "".join(random.choices(ALPHABET, k=length)) + "." + suffix class CpuCollector: @@ -34,14 +41,17 @@ class CpuCollector: def __init__(self): self.per_cpu_configs = [] # 随机生成一个index,用于标识图表 - self.per_cpu_usage_index = random_index() + self.per_cpu_usage_chart_index = random_index() + self.cpu_usage_key = generate_key("cpu.pct") + self.per_cpu_usage_key = generate_key("cpu.{idx}.pct") + self.proc_thds_key = generate_key("cpu.thds") def get_cpu_usage(self) -> HardwareInfo: """ 获取当前 CPU 使用率 """ return { - "key": "cpu.pct", + "key": self.cpu_usage_key, "name": "CPU Utilization (%)", "value": psutil.cpu_percent(interval=1), "config": self.CPU_CONFIG, @@ -56,12 +66,12 @@ def get_per_cpu_usage(self) -> HardwareInfoList: # 避免每次调用都创建新的配置 if len(self.per_cpu_configs) != len(per_cpu_usage): self.per_cpu_configs = [ - self.PER_CPU_CONFIG.clone(metric_name=f"CPU {idx}", chart_index=self.per_cpu_usage_index) + self.PER_CPU_CONFIG.clone(metric_name=f"CPU {idx}", chart_index=self.per_cpu_usage_chart_index) for idx in range(len(per_cpu_usage)) ] for idx, value in enumerate(per_cpu_usage): info: HardwareInfo = { - "key": f"cpu.{idx}.pct", + "key": self.per_cpu_usage_key.format(idx=idx), "name": f"CPU {idx} Utilization (%)", "value": value, "config": self.per_cpu_configs[idx], @@ -74,7 +84,7 @@ def get_cur_proc_thds_num(self, proc: psutil.Process) -> HardwareInfo: 获取当前进程的线程数 """ return { - "key": "cpu.thds", + "key": self.proc_thds_key, "name": "Process CPU Threads", "value": proc.num_threads(), "config": self.THDS_CONFIG, @@ -90,12 +100,18 @@ class MemoryCollector: MEM_CONFIG = ColumnConfig(y_range=(0, 100)) PROC_MEM_PCT_CONFIG = ColumnConfig(y_range=(0, 100)) + def __init__(self): + self.mem_usage_key = generate_key("mem.pct") + self.mem_proc_key = generate_key("mem.proc") + self.mem_proc_pct_key = generate_key("mem.proc.pct") + self.mem_proc_avail_key = generate_key("mem.proc.avail") + def get_mem_usage(self) -> HardwareInfo: """ 获取当前系统内存使用率 """ return { - "key": "mem.pct", + "key": self.mem_usage_key, "name": "System Memory Utilization (%)", "value": psutil.virtual_memory().percent, "config": self.MEM_CONFIG, @@ -108,19 +124,19 @@ def get_cur_proc_mem(self, proc: psutil.Process) -> HardwareInfoList: mem_info = proc.memory_info() virtual_memory = psutil.virtual_memory() mem_proc: HardwareInfo = { - "key": "mem.proc", + "key": self.mem_proc_key, "name": "Process Memory In Use (non-swap) (MB)", "value": mem_info.rss / self.MB, "config": None, } mem_proc_pct: HardwareInfo = { - "key": "mem.proc.pct", + "key": self.mem_proc_pct_key, "name": "Process Memory Utilization (%)", "value": proc.memory_percent(), "config": self.PROC_MEM_PCT_CONFIG, } mem_proc_avail: HardwareInfo = { - "key": "mem.proc.avail", + "key": self.mem_proc_avail_key, "name": "Process Memory Available (MB)", "value": virtual_memory.available / self.MB, "config": None, diff --git a/test/unit/data/run/metadata/hardware/test_utils.py b/test/unit/data/run/metadata/hardware/test_utils.py index d76973ba..5d6f8890 100644 --- a/test/unit/data/run/metadata/hardware/test_utils.py +++ b/test/unit/data/run/metadata/hardware/test_utils.py @@ -5,13 +5,25 @@ @description: 测试硬件信息采集工具 """ -from swanlab.data.run.metadata.hardware.utils import random_index, CpuCollector +from swanlab.data.run.metadata.hardware.utils import random_index, CpuCollector, generate_key def test_random_index(): s = random_index() assert len(s) == 8 assert s.isalnum() + s = random_index(10) + assert len(s) == 10 + assert s.isalnum() + + +def test_generate_key(): + s = generate_key("test") + assert len(s) == 9 + assert s.endswith(".test") + s = generate_key("test", 10) + assert len(s) == 15 + assert s.endswith(".test") def test_cpu_usage(): From a33ada6f33b530770675fcc338064d844bebd610 Mon Sep 17 00:00:00 2001 From: cunyue Date: Tue, 10 Dec 2024 11:34:08 +0000 Subject: [PATCH 05/21] feat: nvidia gpu info --- .../data/run/metadata/hardware/gpu/nvidia.py | 119 +++++++++++++++--- .../run/metadata/hardware/gpu/test_nvidia.py | 52 ++++++++ 2 files changed, 157 insertions(+), 14 deletions(-) create mode 100644 test/unit/data/run/metadata/hardware/gpu/test_nvidia.py diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index e814056d..ccc6dc49 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -9,23 +9,13 @@ import pynvml -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult +from ..utils import generate_key, ColumnConfig, random_index +from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareCollector, HardwareInfoList, HardwareInfo def get_nvidia_gpu_info() -> HardwareFuncResult: """获取 GPU 信息""" - def get_cuda_version(): - """获取 CUDA 版本""" - try: - output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8") - for line in output.split('\n'): - if "release" in line: - version = line.split("release")[-1].strip().split(" ")[0][:-1] - return version - except Exception: # noqa - return None - info = {"driver": None, "cores": None, "type": [], "memory": [], "cuda": None} try: pynvml.nvmlInit() @@ -58,6 +48,107 @@ def get_cuda_version(): except pynvml.NVMLError: pass finally: - # 结束 NVML + if info['cores'] is None: + # 结束 NVML + pynvml.nvmlShutdown() + return info, None + return info, GpuCollector() + +def get_cuda_version(): + """获取 CUDA 版本""" + try: + output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8") + for line in output.split('\n'): + if "release" in line: + version = line.split("release")[-1].strip().split(" ")[0][:-1] + return version + except Exception: # noqa + return None + +class GpuCollector(HardwareCollector): + + def __init__(self): + super().__init__() + count = int(pynvml.nvmlDeviceGetCount()) + self.gpu_mem_pct_key = generate_key("gpu.{idx}.mem.ptc") + mem_pct_config = ColumnConfig(y_range=(0, 100), chart_name="GPU Utilization (%)", chart_index=random_index()) + self.gpu_temp_key = generate_key("gpu.{idx}.temp") + tem_config = ColumnConfig(chart_name="GPU Temperature (℃)", chart_index=random_index()) + self.gpu_power_key = generate_key("gpu.{idx}.power") + power_config = ColumnConfig(chart_name="GPU Power Usage (W)", chart_index=random_index()) + self.per_gpu_configs = { + self.gpu_mem_pct_key: [], + self.gpu_temp_key: [], + self.gpu_power_key: [] + } + self.handles = [] + for idx in range(count): + metric_name = "GPU {idx}".format(idx=idx) + self.per_gpu_configs[self.gpu_mem_pct_key].append(mem_pct_config.clone(metric_name=metric_name)) + self.per_gpu_configs[self.gpu_temp_key].append(tem_config.clone(metric_name=metric_name)) + self.per_gpu_configs[self.gpu_power_key].append(power_config.clone(metric_name=metric_name)) + self.handles.append(pynvml.nvmlDeviceGetHandleByIndex(idx)) + self.count = count + + def get_gpu_config(self, key: str, idx:int) -> ColumnConfig: + """ + 获取 某个GPU的某个配置信息 + """ + return self.per_gpu_configs[key][idx] + + + def get_gpu_mem_pct(self, idx: int, handle) -> HardwareInfo: + """ + 获取 GPU 内存使用率 + """ + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + mem_pct = mem_info.used / mem_info.total * 100 + return { + "key": self.gpu_mem_pct_key.format(idx=idx), + "value": mem_pct, + "name": "GPU {idx} Utilization (%)".format(idx=idx), + "config": self.get_gpu_config(self.gpu_mem_pct_key, idx) + } + + + def get_gpu_temp(self, idx: int, handle) -> HardwareInfo: + """ + 获取 GPU 温度 + """ + temp_info = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) + return { + "key": self.gpu_temp_key.format(idx=idx), + "value": temp_info, + "name": "GPU {idx} Temperature (℃)".format(idx=idx), + "config": self.get_gpu_config(self.gpu_temp_key, idx) + } + + + def get_gpu_power(self, idx: int, handle) -> HardwareInfo: + """ + 获取 GPU 功耗 + """ + # 功耗单位为mW,转换为W + power_info = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000 + return { + "key": self.gpu_power_key.format(idx=idx), + "value": power_info, + "name": "GPU {idx} Power Usage (W)".format(idx=idx), + "config": self.get_gpu_config(self.gpu_power_key, idx) + } + + def collect(self) -> HardwareInfoList: + """ + 采集信息 + """ + info_list: HardwareInfoList = [] + for idx, handle in enumerate(self.handles): + info_list.append(self.get_gpu_mem_pct(idx, handle)) + info_list.append(self.get_gpu_temp(idx, handle)) + info_list.append(self.get_gpu_power(idx, handle)) + return info_list + + def __del__(self): pynvml.nvmlShutdown() - return info, None + + \ No newline at end of file diff --git a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py new file mode 100644 index 00000000..6e659a88 --- /dev/null +++ b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py @@ -0,0 +1,52 @@ +""" +@author: cunyue +@file: test_nvidia.py +@time: 2024/12/5 13:27 +@description: 测试NVIDIA GPU信息采集 +""" +import pynvml +import pytest +from swanlab.data.run.metadata.hardware.gpu.nvidia import GpuCollector +try: + pynvml.nvmlInit() + count = pynvml.nvmlDeviceGetCount() +except Exception: # noqa + count = 0 + +@pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") +def test_get_mem(): + collector = GpuCollector() + # 获取handle + idx = 0 + handle = pynvml.nvmlDeviceGetHandleByIndex(idx) + mem = collector.get_gpu_mem_pct(idx=idx, handle=handle) + assert mem['name'] == "GPU 0 Utilization (%)" + assert mem['config'].y_range == (0, 100) + assert mem['config'].metric_name == "GPU 0" + assert 100>=mem['value'] >= 0 + +@pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") +def test_get_temp(): + collector = GpuCollector() + # 获取handle + idx = 0 + handle = pynvml.nvmlDeviceGetHandleByIndex(idx) + temp = collector.get_gpu_temp(idx=idx, handle=handle) + assert temp['name'] == "GPU 0 Temperature (℃)" + assert temp['config'].y_range is None + assert temp['config'].metric_name == "GPU 0" + assert temp['value'] >= 0 + assert temp['config'].metric_name == "GPU 0" + +@pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") +def test_get_power(): + collector = GpuCollector() + # 获取handle + idx = 0 + handle = pynvml.nvmlDeviceGetHandleByIndex(idx) + power = collector.get_gpu_power(idx=idx, handle=handle) + assert power['name'] == "GPU 0 Power Usage (W)" + assert power['config'].y_range is None + assert power['config'].metric_name == "GPU 0" + assert power['value'] >= 0 + assert power['config'].metric_name == "GPU 0" \ No newline at end of file From 3655f9dff58fbfd052926a1b7d34946d9e803de5 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:01:19 +0800 Subject: [PATCH 06/21] refatcor: code --- ...41\346\201\257\351\207\207\351\233\206.md" | 6 +- .../data/run/metadata/hardware/gpu/nvidia.py | 56 +++++++++---------- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" index f44ac576..83e76c13 100644 --- "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" +++ "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" @@ -112,15 +112,15 @@ class MonitorCron: 如果pynvml库可以识别到Nvidia GPU,swanlab还会采集Nvidia GPU的对应指标,他们的标签类似`gpu.{gpu_index}...`。 同指标不同编号的GPU将自动在一个指标图表中展示。 -#### GPU Memory Utilization (%) +#### GPU Utilization (%) 表示每个GPU的显存利用率百分比,swanlab为它打了一个 `gpu.{gpu_index}.mem.ptc` 标签。 -#### GPU Temperature +#### GPU Temperature (℃) 表示每个GPU的摄氏温度,swanlab为它打了一个 `gpu.{gpu_index}.temp` 标签。 -#### GPU Power Usage Watts +#### GPU Power Usage (W) 表示每个GPU的功耗,swanlab为它打了一个 `gpu.{gpu_index}.power` 标签。 diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index ccc6dc49..f036ec04 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -9,8 +9,8 @@ import pynvml +from ..type import HardwareFuncResult, HardwareCollector, HardwareInfoList, HardwareInfo from ..utils import generate_key, ColumnConfig, random_index -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareCollector, HardwareInfoList, HardwareInfo def get_nvidia_gpu_info() -> HardwareFuncResult: @@ -48,11 +48,9 @@ def get_nvidia_gpu_info() -> HardwareFuncResult: except pynvml.NVMLError: pass finally: - if info['cores'] is None: - # 结束 NVML - pynvml.nvmlShutdown() - return info, None - return info, GpuCollector() + pynvml.nvmlShutdown() + return info, None if not info["cores"] else GpuCollector(int(info["cores"])) + def get_cuda_version(): """获取 CUDA 版本""" @@ -65,22 +63,18 @@ def get_cuda_version(): except Exception: # noqa return None + class GpuCollector(HardwareCollector): - - def __init__(self): + + def __init__(self, count: int): super().__init__() - count = int(pynvml.nvmlDeviceGetCount()) self.gpu_mem_pct_key = generate_key("gpu.{idx}.mem.ptc") mem_pct_config = ColumnConfig(y_range=(0, 100), chart_name="GPU Utilization (%)", chart_index=random_index()) self.gpu_temp_key = generate_key("gpu.{idx}.temp") tem_config = ColumnConfig(chart_name="GPU Temperature (℃)", chart_index=random_index()) self.gpu_power_key = generate_key("gpu.{idx}.power") power_config = ColumnConfig(chart_name="GPU Power Usage (W)", chart_index=random_index()) - self.per_gpu_configs = { - self.gpu_mem_pct_key: [], - self.gpu_temp_key: [], - self.gpu_power_key: [] - } + self.per_gpu_configs = {self.gpu_mem_pct_key: [], self.gpu_temp_key: [], self.gpu_power_key: []} self.handles = [] for idx in range(count): metric_name = "GPU {idx}".format(idx=idx) @@ -88,15 +82,13 @@ def __init__(self): self.per_gpu_configs[self.gpu_temp_key].append(tem_config.clone(metric_name=metric_name)) self.per_gpu_configs[self.gpu_power_key].append(power_config.clone(metric_name=metric_name)) self.handles.append(pynvml.nvmlDeviceGetHandleByIndex(idx)) - self.count = count - - def get_gpu_config(self, key: str, idx:int) -> ColumnConfig: + + def get_gpu_config(self, key: str, idx: int) -> ColumnConfig: """ 获取 某个GPU的某个配置信息 """ return self.per_gpu_configs[key][idx] - - + def get_gpu_mem_pct(self, idx: int, handle) -> HardwareInfo: """ 获取 GPU 内存使用率 @@ -107,10 +99,9 @@ def get_gpu_mem_pct(self, idx: int, handle) -> HardwareInfo: "key": self.gpu_mem_pct_key.format(idx=idx), "value": mem_pct, "name": "GPU {idx} Utilization (%)".format(idx=idx), - "config": self.get_gpu_config(self.gpu_mem_pct_key, idx) + "config": self.get_gpu_config(self.gpu_mem_pct_key, idx), } - def get_gpu_temp(self, idx: int, handle) -> HardwareInfo: """ 获取 GPU 温度 @@ -120,10 +111,9 @@ def get_gpu_temp(self, idx: int, handle) -> HardwareInfo: "key": self.gpu_temp_key.format(idx=idx), "value": temp_info, "name": "GPU {idx} Temperature (℃)".format(idx=idx), - "config": self.get_gpu_config(self.gpu_temp_key, idx) + "config": self.get_gpu_config(self.gpu_temp_key, idx), } - def get_gpu_power(self, idx: int, handle) -> HardwareInfo: """ 获取 GPU 功耗 @@ -134,21 +124,25 @@ def get_gpu_power(self, idx: int, handle) -> HardwareInfo: "key": self.gpu_power_key.format(idx=idx), "value": power_info, "name": "GPU {idx} Power Usage (W)".format(idx=idx), - "config": self.get_gpu_config(self.gpu_power_key, idx) + "config": self.get_gpu_config(self.gpu_power_key, idx), } - + def collect(self) -> HardwareInfoList: """ 采集信息 """ info_list: HardwareInfoList = [] - for idx, handle in enumerate(self.handles): - info_list.append(self.get_gpu_mem_pct(idx, handle)) - info_list.append(self.get_gpu_temp(idx, handle)) - info_list.append(self.get_gpu_power(idx, handle)) + # 信息采集为低频需求,大概每半分钟到一分钟采集一次 + # 因此每次采集前初始化一次,这样可以避免内存泄漏 + try: + pynvml.nvmlInit() + for idx, handle in enumerate(self.handles): + info_list.append(self.get_gpu_mem_pct(idx, handle)) + info_list.append(self.get_gpu_temp(idx, handle)) + info_list.append(self.get_gpu_power(idx, handle)) + finally: + pynvml.nvmlShutdown() return info_list def __del__(self): pynvml.nvmlShutdown() - - \ No newline at end of file From 70b25de05ca1a0d64701fdbfb0b14b744a8bbbf1 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:34:13 +0800 Subject: [PATCH 07/21] feat: collect guard --- swanlab/data/run/helper.py | 20 ++++++-- .../data/run/metadata/hardware/gpu/nvidia.py | 26 ++++++---- .../data/run/metadata/hardware/npu/ascend.py | 2 - .../data/run/metadata/hardware/soc/apple.py | 4 +- swanlab/data/run/metadata/hardware/type.py | 48 ++++++++++++++++++- swanlab/data/run/metadata/hardware/utils.py | 22 ++++----- .../run/metadata/hardware/soc/test_apple.py | 18 +++++++ .../data/run/metadata/hardware/test_type.py | 47 +++++++++++++++++- 8 files changed, 155 insertions(+), 32 deletions(-) create mode 100644 test/unit/data/run/metadata/hardware/soc/test_apple.py diff --git a/swanlab/data/run/helper.py b/swanlab/data/run/helper.py index 74031a48..f49adddf 100644 --- a/swanlab/data/run/helper.py +++ b/swanlab/data/run/helper.py @@ -126,15 +126,17 @@ class MonitorCron: 用于定时采集系统信息 """ - SLEEP_TIME = 1 - def __init__(self, monitor_func: Callable): + self.count = 0 # 计数器,执行次数 + def _(): monitor_func() - self.timer = threading.Timer(self.SLEEP_TIME, _) + self.count += 1 + self.timer = threading.Timer(self.sleep_time, _) self.timer.daemon = True self.timer.start() + # 立即执行 self.timer = threading.Timer(0, _) self.timer.daemon = True self.timer.start() @@ -143,6 +145,18 @@ def cancel(self): if self.timer is not None: self.timer.cancel() + @property + def sleep_time(self): + # 采集10次以下,每次间隔10秒 + # 采集10次到50次,每次间隔30秒 + # 采集50次以上,每次间隔60秒 + if self.count < 10: + return 1 + elif self.count < 50: + return 3 + else: + return 6 + def check_log_level(log_level: Optional[str]) -> str: """检查日志等级是否合法""" diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index f036ec04..e22fc1a9 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -9,6 +9,7 @@ import pynvml +from swanlab.log import swanlog from ..type import HardwareFuncResult, HardwareCollector, HardwareInfoList, HardwareInfo from ..utils import generate_key, ColumnConfig, random_index @@ -132,17 +133,22 @@ def collect(self) -> HardwareInfoList: 采集信息 """ info_list: HardwareInfoList = [] - # 信息采集为低频需求,大概每半分钟到一分钟采集一次 - # 因此每次采集前初始化一次,这样可以避免内存泄漏 - try: - pynvml.nvmlInit() - for idx, handle in enumerate(self.handles): - info_list.append(self.get_gpu_mem_pct(idx, handle)) - info_list.append(self.get_gpu_temp(idx, handle)) - info_list.append(self.get_gpu_power(idx, handle)) - finally: - pynvml.nvmlShutdown() + # 低频采集下(30s以下),应该每次采集时都执行pynvml.nvmlInit() + # 高频采集下(30s以上),应该在初始化时执行pynvml.nvmlInit(),在最后一次采集时执行pynvml.nvmlShutdown() + # 在外部定时任务处,超过10次即变为低频采集,因此需要判断一下 + for idx, handle in enumerate(self.handles): + info_list.append(self.get_gpu_mem_pct(idx, handle)) + info_list.append(self.get_gpu_temp(idx, handle)) + info_list.append(self.get_gpu_power(idx, handle)) return info_list def __del__(self): pynvml.nvmlShutdown() + + def before_collect_impl(self): + pynvml.nvmlInit() + swanlog.debug("NVIDIA GPU nvml inited.") + + def after_collect_impl(self): + pynvml.nvmlShutdown() + swanlog.debug("NVIDIA GPU nvml shutdown.") diff --git a/swanlab/data/run/metadata/hardware/npu/ascend.py b/swanlab/data/run/metadata/hardware/npu/ascend.py index 9120d3ce..aa16ec82 100644 --- a/swanlab/data/run/metadata/hardware/npu/ascend.py +++ b/swanlab/data/run/metadata/hardware/npu/ascend.py @@ -53,7 +53,6 @@ def get_version() -> str: def map_npu() -> dict: """ 列出所有NPU设备,并包含芯片的映射关系 - """ output = subprocess.run(["npu-smi", "info", "-m"], capture_output=True, check=True, text=True).stdout # npu_id -> chip_id -> {"id": chip_logic_id, "name": chip_name} @@ -80,7 +79,6 @@ def map_npu() -> dict: def get_chip_usage(npu_id: str, chip_id: str): """ 获取某个NPU设备的芯片信息 - 不再需要获取chip的名称 """ output = subprocess.run( ["npu-smi", "info", "-t", "usages", "-i", npu_id, "-c", chip_id], diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index ee31b753..0a5d57af 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -12,7 +12,7 @@ import psutil -from ..type import HardwareFuncResult, HardwareCollector, HardwareInfoList +from ..type import HardwareFuncResult, HardwareCollector as H, HardwareInfoList from ..utils import CpuCollector as C, MemoryCollector as M @@ -39,7 +39,7 @@ def get_apple_chip_info() -> HardwareFuncResult: return info, AppleChipCollector() -class AppleChipCollector(HardwareCollector, C, M): +class AppleChipCollector(H, C, M): def __init__(self): super().__init__() self.current_process = psutil.Process() diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index 11c027ad..ac5299d0 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -28,14 +28,58 @@ class HardwareInfo(TypedDict): HardwareInfoList = List[HardwareInfo] -class HardwareCollector(ABC): +class CollectGuard: + """ + 采集守卫,在采集任务执行前后可以选择执行一些操作 + 这些操作可能比较耗费资源,因此需要选择性执行,而选择的依据是是执行的次数 + 这与任务的采集间隔有关,即此类适配了 MonitorCron 的采集间隔 + """ + + def __init__(self): + self.collect_num = 0 + + def before_collect(self): + try: + # count=0,执行一次 + if self.collect_num == 0: + return self.before_collect_impl() + # 60>count>0, 不执行 + elif self.collect_num < 60: + return + else: + return self.before_collect_impl() + finally: + self.collect_num += 1 + + def after_collect(self): + # count=1, 执行一次 + if self.collect_num == 1: + return self.after_collect_impl() + # 61>count>1, 不执行 + elif self.collect_num < 61: + return + else: + return self.after_collect_impl() + + def before_collect_impl(self): + pass + + def after_collect_impl(self): + pass + + +class HardwareCollector(CollectGuard, ABC): + @abstractmethod def collect(self) -> HardwareInfoList: pass def __call__(self) -> Optional[HardwareInfoList]: try: - return self.collect() + self.before_collect() + self.collect() + self.after_collect() + return except NotImplementedError as n: raise n except Exception as e: diff --git a/swanlab/data/run/metadata/hardware/utils.py b/swanlab/data/run/metadata/hardware/utils.py index 89817013..137e0841 100644 --- a/swanlab/data/run/metadata/hardware/utils.py +++ b/swanlab/data/run/metadata/hardware/utils.py @@ -38,13 +38,12 @@ class CpuCollector: PER_CPU_CONFIG = ColumnConfig(y_range=(0, 100), chart_name="CPU Utilization (per core) (%)") THDS_CONFIG = ColumnConfig(y_range=(0, None), chart_name="Process CPU Threads") - def __init__(self): - self.per_cpu_configs = [] - # 随机生成一个index,用于标识图表 - self.per_cpu_usage_chart_index = random_index() - self.cpu_usage_key = generate_key("cpu.pct") - self.per_cpu_usage_key = generate_key("cpu.{idx}.pct") - self.proc_thds_key = generate_key("cpu.thds") + per_cpu_configs = [] + # 随机生成一个index,用于标识图表 + per_cpu_usage_chart_index = random_index() + cpu_usage_key = generate_key("cpu.pct") + per_cpu_usage_key = generate_key("cpu.{idx}.pct") + proc_thds_key = generate_key("cpu.thds") def get_cpu_usage(self) -> HardwareInfo: """ @@ -100,11 +99,10 @@ class MemoryCollector: MEM_CONFIG = ColumnConfig(y_range=(0, 100)) PROC_MEM_PCT_CONFIG = ColumnConfig(y_range=(0, 100)) - def __init__(self): - self.mem_usage_key = generate_key("mem.pct") - self.mem_proc_key = generate_key("mem.proc") - self.mem_proc_pct_key = generate_key("mem.proc.pct") - self.mem_proc_avail_key = generate_key("mem.proc.avail") + mem_usage_key = generate_key("mem.pct") + mem_proc_key = generate_key("mem.proc") + mem_proc_pct_key = generate_key("mem.proc.pct") + mem_proc_avail_key = generate_key("mem.proc.avail") def get_mem_usage(self) -> HardwareInfo: """ diff --git a/test/unit/data/run/metadata/hardware/soc/test_apple.py b/test/unit/data/run/metadata/hardware/soc/test_apple.py new file mode 100644 index 00000000..a57d3aae --- /dev/null +++ b/test/unit/data/run/metadata/hardware/soc/test_apple.py @@ -0,0 +1,18 @@ +""" +@author: cunyue +@file: test_apple.py +@time: 2024/12/10 20:33 +@description: 测试苹果芯片信息采集 +""" + +import pytest +from swankit.env import is_macos + +from swanlab.data.run.metadata.hardware.soc.apple import AppleChipCollector + + +@pytest.mark.skipif(not is_macos(), reason="Apple chip info only available on macOS") +def test_apple_collector(): + apple = AppleChipCollector() + apple() + assert apple.collect_num == 1 diff --git a/test/unit/data/run/metadata/hardware/test_type.py b/test/unit/data/run/metadata/hardware/test_type.py index 9c9697a5..1df25fce 100644 --- a/test/unit/data/run/metadata/hardware/test_type.py +++ b/test/unit/data/run/metadata/hardware/test_type.py @@ -5,7 +5,7 @@ @description: 硬件信息采集工具测试 """ -from swanlab.data.run.metadata.hardware.type import HardwareCollector +from swanlab.data.run.metadata.hardware.type import HardwareCollector, CollectGuard from swanlab.data.run.metadata.hardware.type import HardwareInfo @@ -23,3 +23,48 @@ def collect(self) -> HardwareInfo: t = TestErrorCollector() assert t() is None + + +def test_collect_guard(): + class TestGuard(CollectGuard): + def __init__(self): + super().__init__() + self.before_count = 0 + self.after_count = 0 + + def before_collect_impl(self): + self.before_count += 1 + return "before" + + def after_collect_impl(self): + self.after_count += 1 + return "after" + + t = TestGuard() + t.before_collect() + assert t.before_count == 1 + assert t.after_count == 0 + assert t.collect_num == 1 + t.after_collect() + assert t.before_count == 1 + assert t.after_count == 1 + assert t.collect_num == 1 + # 60>count>0, 不执行 + t.collect_num = 30 + t.before_collect() + t.after_collect() + assert t.before_count == 1 + assert t.after_count == 1 + assert t.collect_num == 31 + # 60>count>0, 不执行 + t.collect_num = 59 + t.before_collect() + t.after_collect() + assert t.before_count == 1 + assert t.after_count == 1 + assert t.collect_num == 60 + t.before_collect() + t.after_collect() + assert t.before_count == 2 + assert t.after_count == 2 + assert t.collect_num == 61 From b653f19055d2ab53f5505eb901c93b188d295819 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:51:36 +0800 Subject: [PATCH 08/21] fix: gpu bugs --- swanlab/data/run/main.py | 4 +-- .../data/run/metadata/hardware/gpu/nvidia.py | 29 ++++++++------- swanlab/data/run/metadata/hardware/type.py | 16 ++++----- .../run/metadata/hardware/gpu/test_nvidia.py | 35 +++++++++++-------- .../data/run/metadata/hardware/test_type.py | 5 +-- 5 files changed, 49 insertions(+), 40 deletions(-) diff --git a/swanlab/data/run/main.py b/swanlab/data/run/main.py index d31a16df..dc4c3c0e 100644 --- a/swanlab/data/run/main.py +++ b/swanlab/data/run/main.py @@ -164,8 +164,8 @@ def monitor_func(): key=key, name=name, column_config=cfg, - # column_class="SYSTEM", - # section_type="SYSTEM", + column_class="SYSTEM", + section_type="SYSTEM", ) return monitor_func diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index e22fc1a9..4e85aab3 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -20,10 +20,6 @@ def get_nvidia_gpu_info() -> HardwareFuncResult: info = {"driver": None, "cores": None, "type": [], "memory": [], "cuda": None} try: pynvml.nvmlInit() - except Exception: # noqa - return None, None - - try: # 获取 NVIDIA 驱动版本信息 nv_driver = pynvml.nvmlSystemGetDriverVersion() if isinstance(nv_driver, bytes): @@ -45,12 +41,14 @@ def get_nvidia_gpu_info() -> HardwareFuncResult: info["type"].append(gpu_name) # 获取 GPU 的总显存, 单位为GB info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3))) - except pynvml.NVMLError: pass + except Exception: # noqa + return None, None finally: pynvml.nvmlShutdown() - return info, None if not info["cores"] else GpuCollector(int(info["cores"])) + count = info["cores"] + return info, None if not count else GpuCollector(count) def get_cuda_version(): @@ -82,7 +80,6 @@ def __init__(self, count: int): self.per_gpu_configs[self.gpu_mem_pct_key].append(mem_pct_config.clone(metric_name=metric_name)) self.per_gpu_configs[self.gpu_temp_key].append(tem_config.clone(metric_name=metric_name)) self.per_gpu_configs[self.gpu_power_key].append(power_config.clone(metric_name=metric_name)) - self.handles.append(pynvml.nvmlDeviceGetHandleByIndex(idx)) def get_gpu_config(self, key: str, idx: int) -> ColumnConfig: """ @@ -90,10 +87,11 @@ def get_gpu_config(self, key: str, idx: int) -> ColumnConfig: """ return self.per_gpu_configs[key][idx] - def get_gpu_mem_pct(self, idx: int, handle) -> HardwareInfo: + def get_gpu_mem_pct(self, idx: int) -> HardwareInfo: """ 获取 GPU 内存使用率 """ + handle = self.handles[idx] mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) mem_pct = mem_info.used / mem_info.total * 100 return { @@ -103,10 +101,11 @@ def get_gpu_mem_pct(self, idx: int, handle) -> HardwareInfo: "config": self.get_gpu_config(self.gpu_mem_pct_key, idx), } - def get_gpu_temp(self, idx: int, handle) -> HardwareInfo: + def get_gpu_temp(self, idx: int) -> HardwareInfo: """ 获取 GPU 温度 """ + handle = self.handles[idx] temp_info = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) return { "key": self.gpu_temp_key.format(idx=idx), @@ -115,10 +114,11 @@ def get_gpu_temp(self, idx: int, handle) -> HardwareInfo: "config": self.get_gpu_config(self.gpu_temp_key, idx), } - def get_gpu_power(self, idx: int, handle) -> HardwareInfo: + def get_gpu_power(self, idx: int) -> HardwareInfo: """ 获取 GPU 功耗 """ + handle = self.handles[idx] # 功耗单位为mW,转换为W power_info = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000 return { @@ -137,9 +137,9 @@ def collect(self) -> HardwareInfoList: # 高频采集下(30s以上),应该在初始化时执行pynvml.nvmlInit(),在最后一次采集时执行pynvml.nvmlShutdown() # 在外部定时任务处,超过10次即变为低频采集,因此需要判断一下 for idx, handle in enumerate(self.handles): - info_list.append(self.get_gpu_mem_pct(idx, handle)) - info_list.append(self.get_gpu_temp(idx, handle)) - info_list.append(self.get_gpu_power(idx, handle)) + info_list.append(self.get_gpu_mem_pct(idx)) + info_list.append(self.get_gpu_temp(idx)) + info_list.append(self.get_gpu_power(idx)) return info_list def __del__(self): @@ -147,8 +147,11 @@ def __del__(self): def before_collect_impl(self): pynvml.nvmlInit() + for i in range(pynvml.nvmlDeviceGetCount()): + self.handles.append(pynvml.nvmlDeviceGetHandleByIndex(i)) swanlog.debug("NVIDIA GPU nvml inited.") def after_collect_impl(self): pynvml.nvmlShutdown() + self.handles.clear() swanlog.debug("NVIDIA GPU nvml shutdown.") diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index ac5299d0..8565a811 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -52,12 +52,12 @@ def before_collect(self): self.collect_num += 1 def after_collect(self): - # count=1, 执行一次 - if self.collect_num == 1: - return self.after_collect_impl() - # 61>count>1, 不执行 - elif self.collect_num < 61: + # 60>count>0, 不执行 + if self.collect_num < 60: return + # count=60,执行一次 + elif self.collect_num == 60: + return self.after_collect_impl() else: return self.after_collect_impl() @@ -77,14 +77,14 @@ def collect(self) -> HardwareInfoList: def __call__(self) -> Optional[HardwareInfoList]: try: self.before_collect() - self.collect() - self.after_collect() - return + return self.collect() except NotImplementedError as n: raise n except Exception as e: swanlog.error(f"Hardware info collection failed: {self.__class__.__name__}, {str(e)}") return None + finally: + self.after_collect() # 定义硬件信息执行函数的返回结果 diff --git a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py index 6e659a88..9490937e 100644 --- a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py +++ b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py @@ -4,49 +4,54 @@ @time: 2024/12/5 13:27 @description: 测试NVIDIA GPU信息采集 """ + import pynvml import pytest + from swanlab.data.run.metadata.hardware.gpu.nvidia import GpuCollector + +handles = [] try: pynvml.nvmlInit() count = pynvml.nvmlDeviceGetCount() + handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)] except Exception: # noqa count = 0 + @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") def test_get_mem(): - collector = GpuCollector() + collector = GpuCollector(handles) # 获取handle idx = 0 - handle = pynvml.nvmlDeviceGetHandleByIndex(idx) - mem = collector.get_gpu_mem_pct(idx=idx, handle=handle) - assert mem['name'] == "GPU 0 Utilization (%)" - assert mem['config'].y_range == (0, 100) + mem = collector.get_gpu_mem_pct(idx=idx) + assert mem['name'] == "GPU 0 Utilization (%)" + assert mem['config'].y_range == (0, 100) assert mem['config'].metric_name == "GPU 0" - assert 100>=mem['value'] >= 0 + assert 100 >= mem['value'] >= 0 + @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") def test_get_temp(): - collector = GpuCollector() + collector = GpuCollector(handles) # 获取handle idx = 0 - handle = pynvml.nvmlDeviceGetHandleByIndex(idx) - temp = collector.get_gpu_temp(idx=idx, handle=handle) + temp = collector.get_gpu_temp(idx=idx) assert temp['name'] == "GPU 0 Temperature (℃)" assert temp['config'].y_range is None assert temp['config'].metric_name == "GPU 0" - assert temp['value'] >= 0 + assert temp['value'] >= 0 assert temp['config'].metric_name == "GPU 0" + @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") def test_get_power(): - collector = GpuCollector() + collector = GpuCollector(handles) # 获取handle idx = 0 - handle = pynvml.nvmlDeviceGetHandleByIndex(idx) - power = collector.get_gpu_power(idx=idx, handle=handle) + power = collector.get_gpu_power(idx=idx) assert power['name'] == "GPU 0 Power Usage (W)" assert power['config'].y_range is None assert power['config'].metric_name == "GPU 0" - assert power['value'] >= 0 - assert power['config'].metric_name == "GPU 0" \ No newline at end of file + assert power['value'] >= 0 + assert power['config'].metric_name == "GPU 0" diff --git a/test/unit/data/run/metadata/hardware/test_type.py b/test/unit/data/run/metadata/hardware/test_type.py index 1df25fce..1848ea62 100644 --- a/test/unit/data/run/metadata/hardware/test_type.py +++ b/test/unit/data/run/metadata/hardware/test_type.py @@ -47,14 +47,14 @@ def after_collect_impl(self): assert t.collect_num == 1 t.after_collect() assert t.before_count == 1 - assert t.after_count == 1 + assert t.after_count == 0 assert t.collect_num == 1 # 60>count>0, 不执行 t.collect_num = 30 t.before_collect() t.after_collect() assert t.before_count == 1 - assert t.after_count == 1 + assert t.after_count == 0 assert t.collect_num == 31 # 60>count>0, 不执行 t.collect_num = 59 @@ -63,6 +63,7 @@ def after_collect_impl(self): assert t.before_count == 1 assert t.after_count == 1 assert t.collect_num == 60 + # count=60,不执行,但是after执行 t.before_collect() t.after_collect() assert t.before_count == 2 From 8d077000054400d0357bfeaece85488ef7854850 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Wed, 11 Dec 2024 21:29:02 +0800 Subject: [PATCH 09/21] refactor: hardware config --- requirements.txt | 2 +- .../data/run/metadata/hardware/gpu/nvidia.py | 16 ++- .../data/run/metadata/hardware/soc/apple.py | 2 +- swanlab/data/run/metadata/hardware/type.py | 51 ++++++- swanlab/data/run/metadata/hardware/utils.py | 134 +++++++++++------- .../run/metadata/hardware/gpu/test_nvidia.py | 8 +- .../data/run/metadata/hardware/test_utils.py | 6 +- 7 files changed, 148 insertions(+), 71 deletions(-) diff --git a/requirements.txt b/requirements.txt index 54756343..e32a1f41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -swankit==0.1.2b5 +swankit==0.1.2b6 swanboard==0.1.7b1 cos-python-sdk-v5 urllib3>=1.26.0 diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index 4e85aab3..b854868b 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -11,13 +11,17 @@ from swanlab.log import swanlog from ..type import HardwareFuncResult, HardwareCollector, HardwareInfoList, HardwareInfo -from ..utils import generate_key, ColumnConfig, random_index +from ..utils import generate_key, HardwareConfig, random_index def get_nvidia_gpu_info() -> HardwareFuncResult: """获取 GPU 信息""" info = {"driver": None, "cores": None, "type": [], "memory": [], "cuda": None} + try: + pynvml.nvmlInit() + except Exception: # noqa + return None, None try: pynvml.nvmlInit() # 获取 NVIDIA 驱动版本信息 @@ -43,8 +47,6 @@ def get_nvidia_gpu_info() -> HardwareFuncResult: info["memory"].append(round(pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**3))) except pynvml.NVMLError: pass - except Exception: # noqa - return None, None finally: pynvml.nvmlShutdown() count = info["cores"] @@ -68,11 +70,11 @@ class GpuCollector(HardwareCollector): def __init__(self, count: int): super().__init__() self.gpu_mem_pct_key = generate_key("gpu.{idx}.mem.ptc") - mem_pct_config = ColumnConfig(y_range=(0, 100), chart_name="GPU Utilization (%)", chart_index=random_index()) + mem_pct_config = HardwareConfig(y_range=(0, 100), chart_name="GPU Utilization (%)", chart_index=random_index()) self.gpu_temp_key = generate_key("gpu.{idx}.temp") - tem_config = ColumnConfig(chart_name="GPU Temperature (℃)", chart_index=random_index()) + tem_config = HardwareConfig(chart_name="GPU Temperature (℃)", chart_index=random_index()) self.gpu_power_key = generate_key("gpu.{idx}.power") - power_config = ColumnConfig(chart_name="GPU Power Usage (W)", chart_index=random_index()) + power_config = HardwareConfig(chart_name="GPU Power Usage (W)", chart_index=random_index()) self.per_gpu_configs = {self.gpu_mem_pct_key: [], self.gpu_temp_key: [], self.gpu_power_key: []} self.handles = [] for idx in range(count): @@ -81,7 +83,7 @@ def __init__(self, count: int): self.per_gpu_configs[self.gpu_temp_key].append(tem_config.clone(metric_name=metric_name)) self.per_gpu_configs[self.gpu_power_key].append(power_config.clone(metric_name=metric_name)) - def get_gpu_config(self, key: str, idx: int) -> ColumnConfig: + def get_gpu_config(self, key: str, idx: int) -> HardwareConfig: """ 获取 某个GPU的某个配置信息 """ diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index 0a5d57af..74948a6f 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -12,7 +12,7 @@ import psutil -from ..type import HardwareFuncResult, HardwareCollector as H, HardwareInfoList +from ..type import HardwareFuncResult, HardwareInfoList, HardwareCollector as H from ..utils import CpuCollector as C, MemoryCollector as M diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index 8565a811..b89d142d 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -8,8 +8,10 @@ from abc import ABC, abstractmethod from typing import TypedDict, Tuple, Optional, Any, List, Union +from swankit.callback import YRange from swankit.callback.models import ColumnConfig +from swanlab.data.run.namer import generate_colors from swanlab.log import swanlog @@ -28,6 +30,54 @@ class HardwareInfo(TypedDict): HardwareInfoList = List[HardwareInfo] +class HardwareConfig(ColumnConfig): + """ + 继承自ColumnConfig的硬件配置类 + 每clone一次,metric_color都会自动生成一个 + 如果不指定,初次生成的metric_color为None + """ + + def __init__(self, y_range=None, chart_name=None, chart_index=None, metric_name=None, metric_color=None): + self.__cloned = 0 + super().__init__( + y_range=y_range, + chart_name=chart_name, + chart_index=chart_index, + metric_name=metric_name, + metric_color=metric_color, + ) + + def clone( + self, + y_range: YRange = None, + chart_name: Optional[str] = None, + chart_index: Optional[str] = None, + metric_name: Optional[str] = None, + metric_color: Optional[Tuple[str, str]] = None, + ): + """ + 重写clone方法,每次clone都会生成一个新的metric_color + :param y_range: y轴范围 + :param chart_name: 图表名称 + :param chart_index: 图表索引 + :param metric_name: 指标名称 + :param metric_color: 指标颜色 + :return: 新的HardwareConfig对象 + """ + try: + if metric_color is None: + metric_color = generate_colors(self.__cloned) + return HardwareConfig( + y_range=y_range if y_range is not None else self.y_range, + chart_name=chart_name if chart_name is not None else self.chart_name, + metric_name=metric_name if metric_name is not None else self.metric_name, + chart_index=chart_index if chart_index is not None else self.chart_index, + metric_color=metric_color, + ) + finally: + self.__cloned += 1 + + class CollectGuard: """ 采集守卫,在采集任务执行前后可以选择执行一些操作 @@ -69,7 +119,6 @@ def after_collect_impl(self): class HardwareCollector(CollectGuard, ABC): - @abstractmethod def collect(self) -> HardwareInfoList: pass diff --git a/swanlab/data/run/metadata/hardware/utils.py b/swanlab/data/run/metadata/hardware/utils.py index 137e0841..734e8c55 100644 --- a/swanlab/data/run/metadata/hardware/utils.py +++ b/swanlab/data/run/metadata/hardware/utils.py @@ -8,9 +8,8 @@ import random import psutil -from swankit.callback.models import ColumnConfig -from .type import HardwareInfo, HardwareInfoList +from .type import HardwareInfo, HardwareInfoList, HardwareConfig ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789" @@ -22,11 +21,35 @@ def random_index(length: int = 8) -> str: return "".join(random.choices(ALPHABET, k=length)) -def generate_key(suffix: str, length: int = 4) -> str: +def generate_key(suffix: str) -> str: """ 生成key,用于标识系统列,避免与用户输入的key冲突 """ - return "".join(random.choices(ALPHABET, k=length)) + "." + suffix + return "__swanlab__." + suffix + + +# CPU 使用率 +CPU_PCT_KEY = generate_key("cpu.pct") +CPU_PCT_CONFIG = HardwareConfig( + y_range=(0, 100), + chart_name="CPU Utilization (%)", +).clone() + +# CPU 核心使用率 +CPU_INDEX_PER_KEY = generate_key("cpu.{idx}.pct") +CPU_INDEX_PER_CONFIG = HardwareConfig( + y_range=(0, 100), + chart_name="CPU Utilization (per core) (%)", + chart_index=random_index(), +) +CPU_INDEX_PER_CONFIGS = [CPU_INDEX_PER_CONFIG.clone(metric_name=f"CPU {idx}") for idx in range(psutil.cpu_count())] + +# CPU 线程数 +CPU_THDS_KEY = generate_key("cpu.thds") +CPU_THDS_CONFIG = HardwareConfig( + y_range=(0, None), + chart_name="Process CPU Threads", +).clone() class CpuCollector: @@ -34,110 +57,119 @@ class CpuCollector: cpu采集基类,为子类赋予cpu采集的能力 """ - CPU_CONFIG = ColumnConfig(y_range=(0, 100), chart_name="CPU Utilization (%)") - PER_CPU_CONFIG = ColumnConfig(y_range=(0, 100), chart_name="CPU Utilization (per core) (%)") - THDS_CONFIG = ColumnConfig(y_range=(0, None), chart_name="Process CPU Threads") + __per_cpu_configs = [] - per_cpu_configs = [] - # 随机生成一个index,用于标识图表 - per_cpu_usage_chart_index = random_index() - cpu_usage_key = generate_key("cpu.pct") - per_cpu_usage_key = generate_key("cpu.{idx}.pct") - proc_thds_key = generate_key("cpu.thds") - - def get_cpu_usage(self) -> HardwareInfo: + @staticmethod + def get_cpu_usage() -> HardwareInfo: """ 获取当前 CPU 使用率 """ return { - "key": self.cpu_usage_key, + "key": CPU_PCT_KEY, "name": "CPU Utilization (%)", "value": psutil.cpu_percent(interval=1), - "config": self.CPU_CONFIG, + "config": CPU_PCT_CONFIG, } - def get_per_cpu_usage(self) -> HardwareInfoList: + @staticmethod + def get_per_cpu_usage() -> HardwareInfoList: """ 获取每个 CPU 核心的使用率 """ - per_cpu_usage = psutil.cpu_percent(interval=1, percpu=True) + per_cpu_usages = psutil.cpu_percent(interval=1, percpu=True) result: HardwareInfoList = [] - # 避免每次调用都创建新的配置 - if len(self.per_cpu_configs) != len(per_cpu_usage): - self.per_cpu_configs = [ - self.PER_CPU_CONFIG.clone(metric_name=f"CPU {idx}", chart_index=self.per_cpu_usage_chart_index) - for idx in range(len(per_cpu_usage)) - ] - for idx, value in enumerate(per_cpu_usage): + for idx, value in enumerate(per_cpu_usages): info: HardwareInfo = { - "key": self.per_cpu_usage_key.format(idx=idx), + "key": CPU_INDEX_PER_KEY.format(idx=idx), "name": f"CPU {idx} Utilization (%)", "value": value, - "config": self.per_cpu_configs[idx], + "config": CPU_INDEX_PER_CONFIGS[idx], } result.append(info) return result - def get_cur_proc_thds_num(self, proc: psutil.Process) -> HardwareInfo: + @staticmethod + def get_cur_proc_thds_num(proc: psutil.Process) -> HardwareInfo: """ 获取当前进程的线程数 """ return { - "key": self.proc_thds_key, + "key": CPU_THDS_KEY, "name": "Process CPU Threads", "value": proc.num_threads(), - "config": self.THDS_CONFIG, + "config": CPU_THDS_CONFIG, } +# 内存使用率 +MEM_PCT_KEY = generate_key("mem.pct") +MEM_PCT_CONFIG = HardwareConfig( + y_range=(0, 100), +).clone() + +# 进程内存使用情况 +MEM_PROC_KEY = generate_key("mem.proc") +MEM_PROC_CONFIG = HardwareConfig( + y_range=(0, None), + chart_name="Process Memory In Use (non-swap) (MB)", +).clone() + +# 进程内存使用率 +MEM_PROC_PCT_KEY = generate_key("mem.proc.pct") +MEM_PROC_PCT_CONFIG = HardwareConfig( + y_range=(0, 100), + chart_name="Process Memory Utilization (%)", +).clone() + +# 进程内存可用情况 +PROC_MEM_AVAIL_KEY = generate_key("mem.proc.avail") +PROC_MEM_AVAIL_CONFIG = HardwareConfig( + y_range=(0, None), + chart_name="Process Memory Available (MB)", +).clone() + + class MemoryCollector: """ 内存采集基类,为子类赋予内存采集的能力 """ - MB = 1024 * 1024 - MEM_CONFIG = ColumnConfig(y_range=(0, 100)) - PROC_MEM_PCT_CONFIG = ColumnConfig(y_range=(0, 100)) - - mem_usage_key = generate_key("mem.pct") - mem_proc_key = generate_key("mem.proc") - mem_proc_pct_key = generate_key("mem.proc.pct") - mem_proc_avail_key = generate_key("mem.proc.avail") - - def get_mem_usage(self) -> HardwareInfo: + @staticmethod + def get_mem_usage() -> HardwareInfo: """ 获取当前系统内存使用率 """ return { - "key": self.mem_usage_key, + "key": MEM_PCT_KEY, "name": "System Memory Utilization (%)", "value": psutil.virtual_memory().percent, - "config": self.MEM_CONFIG, + "config": MEM_PCT_CONFIG, } - def get_cur_proc_mem(self, proc: psutil.Process) -> HardwareInfoList: + @staticmethod + def get_cur_proc_mem(proc: psutil.Process) -> HardwareInfoList: """ 获取当前进程的内存使用情况 """ mem_info = proc.memory_info() virtual_memory = psutil.virtual_memory() mem_proc: HardwareInfo = { - "key": self.mem_proc_key, + "key": MEM_PROC_KEY, "name": "Process Memory In Use (non-swap) (MB)", - "value": mem_info.rss / self.MB, + "value": mem_info.rss / 1024 / 1024, "config": None, } mem_proc_pct: HardwareInfo = { - "key": self.mem_proc_pct_key, + "key": MEM_PROC_PCT_KEY, "name": "Process Memory Utilization (%)", "value": proc.memory_percent(), - "config": self.PROC_MEM_PCT_CONFIG, + "config": MEM_PROC_PCT_CONFIG, } mem_proc_avail: HardwareInfo = { - "key": self.mem_proc_avail_key, + "key": PROC_MEM_AVAIL_KEY, "name": "Process Memory Available (MB)", - "value": virtual_memory.available / self.MB, - "config": None, + "value": virtual_memory.available / 1024 / 1024, + "config": PROC_MEM_AVAIL_CONFIG, } return [mem_proc, mem_proc_pct, mem_proc_avail] diff --git a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py index 9490937e..92e16c78 100644 --- a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py +++ b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py @@ -10,18 +10,16 @@ from swanlab.data.run.metadata.hardware.gpu.nvidia import GpuCollector -handles = [] try: pynvml.nvmlInit() count = pynvml.nvmlDeviceGetCount() - handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)] except Exception: # noqa count = 0 @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") def test_get_mem(): - collector = GpuCollector(handles) + collector = GpuCollector(count) # 获取handle idx = 0 mem = collector.get_gpu_mem_pct(idx=idx) @@ -33,7 +31,7 @@ def test_get_mem(): @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") def test_get_temp(): - collector = GpuCollector(handles) + collector = GpuCollector(count) # 获取handle idx = 0 temp = collector.get_gpu_temp(idx=idx) @@ -46,7 +44,7 @@ def test_get_temp(): @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") def test_get_power(): - collector = GpuCollector(handles) + collector = GpuCollector(count) # 获取handle idx = 0 power = collector.get_gpu_power(idx=idx) diff --git a/test/unit/data/run/metadata/hardware/test_utils.py b/test/unit/data/run/metadata/hardware/test_utils.py index 5d6f8890..327768e1 100644 --- a/test/unit/data/run/metadata/hardware/test_utils.py +++ b/test/unit/data/run/metadata/hardware/test_utils.py @@ -19,11 +19,7 @@ def test_random_index(): def test_generate_key(): s = generate_key("test") - assert len(s) == 9 - assert s.endswith(".test") - s = generate_key("test", 10) - assert len(s) == 15 - assert s.endswith(".test") + assert s == "__swanlab__.test" def test_cpu_usage(): From fabdca33a49921dbcb1b330222b9ec2ac295bb9e Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:20:39 +0800 Subject: [PATCH 10/21] feat: ascend npu --- ...41\346\201\257\351\207\207\351\233\206.md" | 8 +- .../data/run/metadata/hardware/gpu/nvidia.py | 10 +- .../data/run/metadata/hardware/npu/ascend.py | 93 ++++++++++++++++++- 3 files changed, 101 insertions(+), 10 deletions(-) diff --git "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" index 83e76c13..816fbc82 100644 --- "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" +++ "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" @@ -128,12 +128,14 @@ class MonitorCron: 如果swanlab识别到Ascend NPU,swanlab会采集Ascend NPU的对应指标,他们的标签类似`npu.{npu_index}...`。 同指标不同编号的NPU将自动在一个指标图表中展示。 +根据Ascend NPU的[官方文档](https://support.huawei.com/enterprise/zh/doc/EDOC1100388864/8c5e18a7), +唯一定位一块计算芯片需要同时知道NPU ID和Chip ID,因此对于Ascend NPU而言,`npu_index = f{npu_id}-{chip_id}`。 -#### HBM Usage Rate (%) +#### NPU Utilization (%) -表示每个NPU的HBM利用率百分比,swanlab为它打了一个 `npu.{npu_index}.hbm.ptc` 标签。 +表示每个NPU的HBM利用率百分比,swanlab为它打了一个 `npu.{npu_index}.mem.ptc` 标签。 -#### NPU Temperature +#### NPU Temperature (℃) 表示每个NPU的摄氏温度,swanlab为它打了一个 `npu.{npu_index}.temp` 标签。 diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index b854868b..c1ce05bf 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -134,15 +134,15 @@ def collect(self) -> HardwareInfoList: """ 采集信息 """ - info_list: HardwareInfoList = [] + result: HardwareInfoList = [] # 低频采集下(30s以下),应该每次采集时都执行pynvml.nvmlInit() # 高频采集下(30s以上),应该在初始化时执行pynvml.nvmlInit(),在最后一次采集时执行pynvml.nvmlShutdown() # 在外部定时任务处,超过10次即变为低频采集,因此需要判断一下 for idx, handle in enumerate(self.handles): - info_list.append(self.get_gpu_mem_pct(idx)) - info_list.append(self.get_gpu_temp(idx)) - info_list.append(self.get_gpu_power(idx)) - return info_list + result.append(self.get_gpu_mem_pct(idx)) + result.append(self.get_gpu_temp(idx)) + result.append(self.get_gpu_power(idx)) + return result def __del__(self): pynvml.nvmlShutdown() diff --git a/swanlab/data/run/metadata/hardware/npu/ascend.py b/swanlab/data/run/metadata/hardware/npu/ascend.py index aa16ec82..515a47e4 100644 --- a/swanlab/data/run/metadata/hardware/npu/ascend.py +++ b/swanlab/data/run/metadata/hardware/npu/ascend.py @@ -5,11 +5,13 @@ @description: 华为昇腾NPU信息采集 """ +import math import os import platform import subprocess -from swanlab.data.run.metadata.hardware.type import HardwareFuncResult +from ..type import HardwareFuncResult, HardwareInfoList, HardwareConfig, HardwareInfo, HardwareCollector as H +from ..utils import generate_key, random_index def get_ascend_npu_info() -> HardwareFuncResult: @@ -25,6 +27,7 @@ def get_ascend_npu_info() -> HardwareFuncResult: if not list(filter(lambda x: x.startswith("davinci"), os.listdir("/dev"))): return None, None info = {"driver": None, "npu": None} + collector = None try: # 获取NPU驱动版本 info["driver"] = get_version() @@ -39,10 +42,11 @@ def get_ascend_npu_info() -> HardwareFuncResult: if npu_id not in info["npu"]: info["npu"][npu_id] = {} info["npu"][npu_id][chip_id] = {**chip_info, "usage": usage} + collector = AscendCollector(npu_map) except Exception: # noqa if all(v is None for v in info.values()): return None, None - return info, None + return info, collector def get_version() -> str: @@ -96,3 +100,88 @@ def get_chip_usage(npu_id: str, chip_id: str): usage["hbm"] = str(round(int(hbm) / 1024)) break return usage + + +class AscendCollector(H): + def __init__(self, npu_map): + super().__init__() + self.npu_map = npu_map + # HBM Usage Rate(%) + self.hbm_rate_key = generate_key("npu.{npu_index}.mem.ptc") + hbm_rate_config = HardwareConfig( + y_range=(0, 100), + chart_index=random_index(), + chart_name="NPU Utilization (%)", + ) + self.per_hbm_configs = {} + # NPU Temperature (℃) + self.temp_key = generate_key("npu.{npu_index}.temp") + temp_config = HardwareConfig( + y_range=(0, None), + chart_index=random_index(), + chart_name="NPU Temperature (℃)", + ) + self.per_temp_configs = {} + + for npu_id in npu_map: + for chip_id in npu_map[npu_id]: + metric_name = f"NPU {npu_id}-{chip_id}" + self.per_hbm_configs[metric_name] = hbm_rate_config.clone(metric_name=metric_name) + self.per_temp_configs[metric_name] = temp_config.clone(metric_name=metric_name) + + def collect(self) -> HardwareInfoList: + result: HardwareInfoList = [] + for npu_id in self.npu_map: + for chip_id in self.npu_map[npu_id]: + result.append(self.get_hbm_rate(npu_id, chip_id)) + result.append(self.get_chip_temp(npu_id, chip_id)) + return result + + def get_hbm_rate(self, npu_id: str, chip_id: str) -> HardwareInfo: + """ + 获取指定NPU设备的芯片HBM的用量信息 + """ + output = subprocess.run( + ["npu-smi", "info", "-t", "usages", "-i", npu_id, "-c", chip_id], + capture_output=True, + text=True, + ).stdout + rate = math.nan + for line in output.split("\n"): + if "hbm usage rate" in line.lower(): + line = line.split(":") + # HBM Capacity的值在最后一个 + hbm = line[-1].strip() + if hbm.isdigit(): + rate = float(hbm) + break + _id, metric_name = self.get_label(npu_id, chip_id) + return { + "key": self.hbm_rate_key.format(npu_index=_id), + "name": f"{metric_name} Utilization (%)", + "value": rate, + "config": self.per_hbm_configs[metric_name], + } + + def get_chip_temp(self, npu_id: str, chip_id: str) -> HardwareInfo: + """ + 获取芯片温度 + """ + output = subprocess.run( + ["npu-smi", "info", "-t", "temp", "-i", npu_id, "-c", chip_id], + capture_output=True, + text=True, + ).stdout.strip() + temp = float(output.split(":")[-1].strip()) + _id, metric_name = self.get_label(npu_id, chip_id) + return { + "key": self.temp_key.format(npu_index=_id), + "name": f"{metric_name} Temperature (℃)", + "value": temp, + "config": self.per_temp_configs[metric_name], + } + + @staticmethod + def get_label(npu_id: str, chip_id: str): + _id = f"{npu_id}-{chip_id}" + return _id, f"NPU {_id}" From 107dee3b1ea49a0dc167010214ebe26222465bdf Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:01:41 +0800 Subject: [PATCH 11/21] add: test --- .../data/run/metadata/hardware/gpu/nvidia.py | 7 +- swanlab/data/run/metadata/hardware/type.py | 1 - .../run/metadata/hardware/gpu/test_nvidia.py | 72 ++++++++++++------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index c1ce05bf..fa9edcc2 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -23,7 +23,6 @@ def get_nvidia_gpu_info() -> HardwareFuncResult: except Exception: # noqa return None, None try: - pynvml.nvmlInit() # 获取 NVIDIA 驱动版本信息 nv_driver = pynvml.nvmlSystemGetDriverVersion() if isinstance(nv_driver, bytes): @@ -135,9 +134,6 @@ def collect(self) -> HardwareInfoList: 采集信息 """ result: HardwareInfoList = [] - # 低频采集下(30s以下),应该每次采集时都执行pynvml.nvmlInit() - # 高频采集下(30s以上),应该在初始化时执行pynvml.nvmlInit(),在最后一次采集时执行pynvml.nvmlShutdown() - # 在外部定时任务处,超过10次即变为低频采集,因此需要判断一下 for idx, handle in enumerate(self.handles): result.append(self.get_gpu_mem_pct(idx)) result.append(self.get_gpu_temp(idx)) @@ -148,6 +144,9 @@ def __del__(self): pynvml.nvmlShutdown() def before_collect_impl(self): + # 低频采集下(30s以下),应该每次采集时都执行pynvml.nvmlInit() + # 高频采集下(30s以上),应该在初始化时执行pynvml.nvmlInit(),在最后一次采集时执行pynvml.nvmlShutdown() + # 在外部定时任务处,超过10次即变为低频采集,因此需要判断一下 pynvml.nvmlInit() for i in range(pynvml.nvmlDeviceGetCount()): self.handles.append(pynvml.nvmlDeviceGetHandleByIndex(i)) diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index b89d142d..79b9e88f 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -125,7 +125,6 @@ def collect(self) -> HardwareInfoList: def __call__(self) -> Optional[HardwareInfoList]: try: - self.before_collect() return self.collect() except NotImplementedError as n: raise n diff --git a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py index 92e16c78..69ce68ed 100644 --- a/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py +++ b/test/unit/data/run/metadata/hardware/gpu/test_nvidia.py @@ -18,38 +18,56 @@ @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") -def test_get_mem(): +def test_before_impl(): collector = GpuCollector(count) - # 获取handle - idx = 0 - mem = collector.get_gpu_mem_pct(idx=idx) - assert mem['name'] == "GPU 0 Utilization (%)" - assert mem['config'].y_range == (0, 100) - assert mem['config'].metric_name == "GPU 0" - assert 100 >= mem['value'] >= 0 + collector.before_collect_impl() + assert len(collector.handles) == count @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") -def test_get_temp(): +def test_after_impl(): collector = GpuCollector(count) - # 获取handle - idx = 0 - temp = collector.get_gpu_temp(idx=idx) - assert temp['name'] == "GPU 0 Temperature (℃)" - assert temp['config'].y_range is None - assert temp['config'].metric_name == "GPU 0" - assert temp['value'] >= 0 - assert temp['config'].metric_name == "GPU 0" + collector.after_collect_impl() + assert len(collector.handles) == 0 -@pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") -def test_get_power(): +class TestGpuCollector: collector = GpuCollector(count) - # 获取handle - idx = 0 - power = collector.get_gpu_power(idx=idx) - assert power['name'] == "GPU 0 Power Usage (W)" - assert power['config'].y_range is None - assert power['config'].metric_name == "GPU 0" - assert power['value'] >= 0 - assert power['config'].metric_name == "GPU 0" + + def setup_class(self): + self.collector.before_collect_impl() + + def teardown_class(self): + self.collector.after_collect_impl() + + @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") + def test_get_mem(self): + # 获取handle + idx = 0 + mem = self.collector.get_gpu_mem_pct(idx=idx) + assert mem['name'] == "GPU 0 Utilization (%)" + assert mem['config'].y_range == (0, 100) + assert mem['config'].metric_name == "GPU 0" + assert 100 >= mem['value'] >= 0 + + @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") + def test_get_temp(self): + # 获取handle + idx = 0 + temp = self.collector.get_gpu_temp(idx=idx) + assert temp['name'] == "GPU 0 Temperature (℃)" + assert temp['config'].y_range is None + assert temp['config'].metric_name == "GPU 0" + assert temp['value'] >= 0 + assert temp['config'].metric_name == "GPU 0" + + @pytest.mark.skipif(count == 0, reason="No NVIDIA GPU found") + def test_get_power(self): + # 获取handle + idx = 0 + power = self.collector.get_gpu_power(idx=idx) + assert power['name'] == "GPU 0 Power Usage (W)" + assert power['config'].y_range is None + assert power['config'].metric_name == "GPU 0" + assert power['value'] >= 0 + assert power['config'].metric_name == "GPU 0" From a7ac41e19a20e80eae493c2ddfd8181934e33332 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:20:12 +0800 Subject: [PATCH 12/21] opt: column info --- swanlab/api/upload/model.py | 24 ++++++++++++++---------- swanlab/data/callback_cloud.py | 1 + swanlab/data/run/main.py | 2 +- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/swanlab/api/upload/model.py b/swanlab/api/upload/model.py index 5b2c16bd..15ed9a21 100644 --- a/swanlab/api/upload/model.py +++ b/swanlab/api/upload/model.py @@ -11,7 +11,7 @@ from enum import Enum from typing import List, Optional -from swankit.callback.models import ColumnClass +from swankit.callback.models import ColumnClass, ColumnConfig from swanlab.data.modules import MediaBuffer @@ -27,31 +27,33 @@ def __init__( name: Optional[str], cls: ColumnClass, typ: str, + config: Optional[ColumnConfig], section_name: Optional[str], section_type: Optional[str], error: dict = None, ): """ - Args: - key: 键 - name: 键的名称 - cls: 键的类别 - typ: 键的类型 - section_name: 键所在的section的名称 - section_type: 键所在的section的类型 - error: 错误信息 + key: 键 + name: 键的名称 + cls: 键的类别 + typ: 键的类型 + config: 键的配置 + section_name: 键所在的section的名称 + section_type: 键所在的section的类型 + error: 错误信息 """ self.key = key self.name = name self.cls = cls self.typ = typ + self.config = config self.section_name = section_name self.section_type = section_type self.error = error def to_dict(self): """ - 序列化为Dict + 序列化为Dict,传递给后端 """ d = { "class": self.cls, @@ -70,6 +72,8 @@ def to_dict(self): d.pop("sectionName") if self.section_type is None: d.pop("sectionType") + # 将额外的图表配置信息加入 + return d diff --git a/swanlab/data/callback_cloud.py b/swanlab/data/callback_cloud.py index 7b0d5b6a..a8e65423 100644 --- a/swanlab/data/callback_cloud.py +++ b/swanlab/data/callback_cloud.py @@ -284,6 +284,7 @@ def on_column_create(self, column_info: ColumnInfo): key=column_info.key, name=column_info.name, cls=column_info.cls, + config=column_info.config, typ=column_info.chart_type.value.column_type, section_name=section_name, section_type=column_info.section_type, diff --git a/swanlab/data/run/main.py b/swanlab/data/run/main.py index dc4c3c0e..84fddd57 100644 --- a/swanlab/data/run/main.py +++ b/swanlab/data/run/main.py @@ -158,7 +158,7 @@ def monitor_func(): info['value'], info['config'], ) - v = DataWrapper(key, [Line(value)]) + v = DataWrapper(key, [Line(value)], reference="TIME") self.__exp.add( data=v, key=key, From efe44a6a9e1659eba29f156f7a654a5e690452d0 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:17:50 +0800 Subject: [PATCH 13/21] feat: create system column --- swanlab/api/upload/model.py | 13 ++++++++++++- swanlab/data/run/metadata/hardware/cpu.py | 6 ++++-- swanlab/data/run/metadata/hardware/memory.py | 3 +++ swanlab/data/run/metadata/hardware/soc/apple.py | 4 ++-- swanlab/data/run/metadata/hardware/type.py | 1 + swanlab/data/run/metadata/hardware/utils.py | 6 +++--- 6 files changed, 25 insertions(+), 8 deletions(-) diff --git a/swanlab/api/upload/model.py b/swanlab/api/upload/model.py index 15ed9a21..fc69983a 100644 --- a/swanlab/api/upload/model.py +++ b/swanlab/api/upload/model.py @@ -72,8 +72,19 @@ def to_dict(self): d.pop("sectionName") if self.section_type is None: d.pop("sectionType") + if self.config is None: + return d # 将额外的图表配置信息加入 - + if self.config.y_range is not None: + d["yRange"] = self.config.y_range + if self.config.chart_name is not None: + d["chartName"] = self.config.chart_name + if self.config.chart_index is not None: + d["chartIndex"] = self.config.chart_index + if self.config.metric_name is not None: + d["metricName"] = self.config.metric_name + if self.config.metric_color is not None: + d["metricColors"] = self.config.metric_color return d diff --git a/swanlab/data/run/metadata/hardware/cpu.py b/swanlab/data/run/metadata/hardware/cpu.py index 7fd9e1eb..56075e4d 100644 --- a/swanlab/data/run/metadata/hardware/cpu.py +++ b/swanlab/data/run/metadata/hardware/cpu.py @@ -10,6 +10,7 @@ import subprocess import psutil +from swankit.env import is_macos from swanlab.data.run.metadata.hardware.type import HardwareFuncResult, HardwareCollector, HardwareInfoList from .utils import CpuCollector as C @@ -17,6 +18,8 @@ def get_cpu_info() -> HardwareFuncResult: """获取 CPU 信息""" + if is_macos(): + return None, None info = {"brand": None, "cores": None} # 获取 CPU 品牌, 根据不同操作系统调用不同的函数 @@ -26,7 +29,6 @@ def get_cpu_info() -> HardwareFuncResult: info["brand"] = get_cpu_brand_linux() else: # 其他情况,暂时不支持 - # 苹果芯片单独处理 return None, None try: # 获取 CPU 核心数 @@ -34,7 +36,7 @@ def get_cpu_info() -> HardwareFuncResult: except Exception: # noqa pass - return info, None + return info, CpuCollector() def get_cpu_brand_windows(): diff --git a/swanlab/data/run/metadata/hardware/memory.py b/swanlab/data/run/metadata/hardware/memory.py index ae03830d..24a020d9 100644 --- a/swanlab/data/run/metadata/hardware/memory.py +++ b/swanlab/data/run/metadata/hardware/memory.py @@ -8,6 +8,7 @@ from typing import List import psutil +from swankit.env import is_macos from .type import HardwareFuncResult, HardwareCollector, HardwareInfo from .utils import MemoryCollector as M @@ -15,6 +16,8 @@ def get_memory_size() -> HardwareFuncResult: """获取内存大小""" + if is_macos(): + return None, None try: # 获取系统总内存大小 total = psutil.virtual_memory().total diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index 74948a6f..1da4797e 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -7,17 +7,17 @@ import json import multiprocessing -import platform import subprocess import psutil +from swankit.env import is_macos from ..type import HardwareFuncResult, HardwareInfoList, HardwareCollector as H from ..utils import CpuCollector as C, MemoryCollector as M def get_apple_chip_info() -> HardwareFuncResult: - if "mac" not in platform.platform().lower(): + if not is_macos(): return None, None info = {"cpu": None, "gpu": None, "memory": None, "type": None} diff --git a/swanlab/data/run/metadata/hardware/type.py b/swanlab/data/run/metadata/hardware/type.py index 79b9e88f..b89d142d 100644 --- a/swanlab/data/run/metadata/hardware/type.py +++ b/swanlab/data/run/metadata/hardware/type.py @@ -125,6 +125,7 @@ def collect(self) -> HardwareInfoList: def __call__(self) -> Optional[HardwareInfoList]: try: + self.before_collect() return self.collect() except NotImplementedError as n: raise n diff --git a/swanlab/data/run/metadata/hardware/utils.py b/swanlab/data/run/metadata/hardware/utils.py index 734e8c55..838d969d 100644 --- a/swanlab/data/run/metadata/hardware/utils.py +++ b/swanlab/data/run/metadata/hardware/utils.py @@ -125,7 +125,7 @@ def get_cur_proc_thds_num(proc: psutil.Process) -> HardwareInfo: PROC_MEM_AVAIL_KEY = generate_key("mem.proc.avail") PROC_MEM_AVAIL_CONFIG = HardwareConfig( y_range=(0, None), - chart_name="Process Memory Available (MB)", + chart_name="Process Memory Available (non-swap) (MB)", ).clone() @@ -157,7 +157,7 @@ def get_cur_proc_mem(proc: psutil.Process) -> HardwareInfoList: "key": MEM_PROC_KEY, "name": "Process Memory In Use (non-swap) (MB)", "value": mem_info.rss / 1024 / 1024, - "config": None, + "config": MEM_PROC_CONFIG, } mem_proc_pct: HardwareInfo = { "key": MEM_PROC_PCT_KEY, @@ -167,7 +167,7 @@ def get_cur_proc_mem(proc: psutil.Process) -> HardwareInfoList: } mem_proc_avail: HardwareInfo = { "key": PROC_MEM_AVAIL_KEY, - "name": "Process Memory Available (MB)", + "name": "Process Memory Available (non-swap) (MB)", "value": virtual_memory.available / 1024 / 1024, "config": PROC_MEM_AVAIL_CONFIG, } From da774a446e073ce30533f0b54a88d68b4c2f784e Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:22:18 +0800 Subject: [PATCH 14/21] revert: interval --- swanlab/data/run/helper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/swanlab/data/run/helper.py b/swanlab/data/run/helper.py index f49adddf..8f531280 100644 --- a/swanlab/data/run/helper.py +++ b/swanlab/data/run/helper.py @@ -151,11 +151,11 @@ def sleep_time(self): # 采集10次到50次,每次间隔30秒 # 采集50次以上,每次间隔60秒 if self.count < 10: - return 1 + return 10 elif self.count < 50: - return 3 + return 30 else: - return 6 + return 60 def check_log_level(log_level: Optional[str]) -> str: From 1c888896ed41e8d54fed72ac9fd9820ebd2c0bce Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:29:48 +0800 Subject: [PATCH 15/21] fix: apple intel cpu --- swanlab/data/run/metadata/hardware/soc/apple.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index 1da4797e..2e2d2d7d 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -20,15 +20,19 @@ def get_apple_chip_info() -> HardwareFuncResult: if not is_macos(): return None, None info = {"cpu": None, "gpu": None, "memory": None, "type": None} - # 使用system_profiler命令以JSON格式获取GPU信息 try: result = subprocess.run(["system_profiler", "SPHardwareDataType", "-json"], capture_output=True, text=True) - gpu_name = json.loads(result.stdout)["SPHardwareDataType"][0]["chip_type"] + chip_name = json.loads(result.stdout)["SPHardwareDataType"][0].get("chip_name", None) + # 早期intel芯片的机器 + if chip_name is None: + chip_name = json.loads(result.stdout)["SPHardwareDataType"][0].get("cpu_type", None) + if chip_name is None: + raise Exception("Can't get apple chip name") memory = json.loads(result.stdout)["SPHardwareDataType"][0]["physical_memory"] memory = str(memory).lower().replace("gb", "") # TODO: 获取GPU信息 - info["type"] = gpu_name + info["type"] = chip_name info["memory"] = memory except Exception: # noqa return None, None From 9b34452c442d9f8ff4c983821da90496e6a1deeb Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:57:19 +0800 Subject: [PATCH 16/21] feat: delete per cpu usage --- swanlab/data/run/metadata/hardware/cpu.py | 6 +++++- swanlab/data/run/metadata/hardware/soc/apple.py | 2 +- test/unit/data/run/metadata/hardware/test_utils.py | 1 - 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/swanlab/data/run/metadata/hardware/cpu.py b/swanlab/data/run/metadata/hardware/cpu.py index 56075e4d..78ab3bb8 100644 --- a/swanlab/data/run/metadata/hardware/cpu.py +++ b/swanlab/data/run/metadata/hardware/cpu.py @@ -68,4 +68,8 @@ def __init__(self): self.current_process = psutil.Process() def collect(self) -> HardwareInfoList: - return [self.get_cpu_usage(), *self.get_per_cpu_usage(), self.get_cur_proc_thds_num(self.current_process)] + return [ + self.get_cpu_usage(), + # *self.get_per_cpu_usage(), + self.get_cur_proc_thds_num(self.current_process), + ] diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index 2e2d2d7d..8924d839 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -51,7 +51,7 @@ def __init__(self): def collect(self) -> HardwareInfoList: return [ self.get_cpu_usage(), - *self.get_per_cpu_usage(), + # *self.get_per_cpu_usage(), self.get_cur_proc_thds_num(self.current_process), self.get_mem_usage(), *self.get_cur_proc_mem(self.current_process), diff --git a/test/unit/data/run/metadata/hardware/test_utils.py b/test/unit/data/run/metadata/hardware/test_utils.py index 327768e1..45604b90 100644 --- a/test/unit/data/run/metadata/hardware/test_utils.py +++ b/test/unit/data/run/metadata/hardware/test_utils.py @@ -45,5 +45,4 @@ def test_per_cpu_usage(): assert u["config"].y_range == (0, 100) assert u["config"].chart_name == f"CPU Utilization (per core) (%)" # 每个核心的index应该相同,因为必须要放在同一个图表中 - assert u["config"].chart_index == c.per_cpu_usage_index assert u["config"].metric_name == f"CPU {idx}" From bc5f1fcd24fb63163c85ecaab52a6ca1046972e0 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:45:44 +0800 Subject: [PATCH 17/21] fix: cpu test --- test/unit/data/run/metadata/hardware/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/data/run/metadata/hardware/test_utils.py b/test/unit/data/run/metadata/hardware/test_utils.py index 45604b90..811f0ac6 100644 --- a/test/unit/data/run/metadata/hardware/test_utils.py +++ b/test/unit/data/run/metadata/hardware/test_utils.py @@ -27,7 +27,7 @@ def test_cpu_usage(): usage = c.get_cpu_usage() assert usage is not None assert 0 <= usage["value"] <= 100 - assert usage["key"] == "cpu.pct" + assert usage["key"].endswith("cpu.pct") assert usage["name"] == "CPU Utilization (%)" assert usage["config"].y_range == (0, 100) assert usage["config"].chart_name == "CPU Utilization (%)" @@ -40,7 +40,7 @@ def test_per_cpu_usage(): assert usage is not None for idx, u in enumerate(usage): assert 0 <= u["value"] <= 100 - assert u["key"] == f"cpu.{idx}.pct" + assert u["key"].endswith(f"cpu.{idx}.pct") assert u["name"] == f"CPU {idx} Utilization (%)" assert u["config"].y_range == (0, 100) assert u["config"].chart_name == f"CPU Utilization (per core) (%)" From da79a8ecc18fba893a76721547454d20a870c8c6 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 20:35:51 +0800 Subject: [PATCH 18/21] fix: apple soc --- swanlab/data/run/metadata/hardware/soc/apple.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/swanlab/data/run/metadata/hardware/soc/apple.py b/swanlab/data/run/metadata/hardware/soc/apple.py index 8924d839..c130e2d4 100644 --- a/swanlab/data/run/metadata/hardware/soc/apple.py +++ b/swanlab/data/run/metadata/hardware/soc/apple.py @@ -23,16 +23,17 @@ def get_apple_chip_info() -> HardwareFuncResult: # 使用system_profiler命令以JSON格式获取GPU信息 try: result = subprocess.run(["system_profiler", "SPHardwareDataType", "-json"], capture_output=True, text=True) - chip_name = json.loads(result.stdout)["SPHardwareDataType"][0].get("chip_name", None) + hardware_info = json.loads(result.stdout)["SPHardwareDataType"][0] + chip_type = hardware_info.get("chip_type", None) # 早期intel芯片的机器 - if chip_name is None: - chip_name = json.loads(result.stdout)["SPHardwareDataType"][0].get("cpu_type", None) - if chip_name is None: + if chip_type is None: + chip_type = hardware_info.get("cpu_type", None) + if chip_type is None: raise Exception("Can't get apple chip name") - memory = json.loads(result.stdout)["SPHardwareDataType"][0]["physical_memory"] + memory = hardware_info["physical_memory"] memory = str(memory).lower().replace("gb", "") # TODO: 获取GPU信息 - info["type"] = chip_name + info["type"] = chip_type info["memory"] = memory except Exception: # noqa return None, None From 4e728d0a813a28a09cf7a1771da907b0d0ff08db Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 22:42:20 +0800 Subject: [PATCH 19/21] feat: util --- ...41\346\201\257\351\207\207\351\233\206.md" | 6 +- .../data/run/metadata/hardware/__init__.py | 5 +- .../data/run/metadata/hardware/gpu/nvidia.py | 35 +++++++++++- .../data/run/metadata/hardware/npu/ascend.py | 56 +++++++++++++------ 4 files changed, 80 insertions(+), 22 deletions(-) diff --git "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" index 816fbc82..81d8ddee 100644 --- "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" +++ "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" @@ -114,6 +114,10 @@ class MonitorCron: #### GPU Utilization (%) +表示每个GPU的利用率百分比,swanlab为它打了一个 `gpu.{gpu_index}.pct` 标签。 + +#### GPU Memory Allocated (%) + 表示每个GPU的显存利用率百分比,swanlab为它打了一个 `gpu.{gpu_index}.mem.ptc` 标签。 #### GPU Temperature (℃) @@ -131,7 +135,7 @@ class MonitorCron: 根据Ascend NPU的[官方文档](https://support.huawei.com/enterprise/zh/doc/EDOC1100388864/8c5e18a7), 唯一定位一块计算芯片需要同时知道NPU ID和Chip ID,因此对于Ascend NPU而言,`npu_index = f{npu_id}-{chip_id}`。 -#### NPU Utilization (%) +#### GPU Memory Allocated (%) 表示每个NPU的HBM利用率百分比,swanlab为它打了一个 `npu.{npu_index}.mem.ptc` 标签。 diff --git a/swanlab/data/run/metadata/hardware/__init__.py b/swanlab/data/run/metadata/hardware/__init__.py index 18f2fb09..bd5d4fc7 100644 --- a/swanlab/data/run/metadata/hardware/__init__.py +++ b/swanlab/data/run/metadata/hardware/__init__.py @@ -22,11 +22,12 @@ def get_hardware_info() -> Tuple[Optional[Any], List[HardwareCollector]]: 采集硬件信息,包括CPU、GPU、内存、硬盘等 """ monitor_funcs = [] - m = dec_hardware_func(get_memory_size, monitor_funcs) - c = dec_hardware_func(get_cpu_info, monitor_funcs) + # 我们希望计算芯片的信息放在最前面,前端展示用 nvidia = dec_hardware_func(get_nvidia_gpu_info, monitor_funcs) ascend = dec_hardware_func(get_ascend_npu_info, monitor_funcs) apple = dec_hardware_func(get_apple_chip_info, monitor_funcs) + c = dec_hardware_func(get_cpu_info, monitor_funcs) + m = dec_hardware_func(get_memory_size, monitor_funcs) info = { "memory": m, diff --git a/swanlab/data/run/metadata/hardware/gpu/nvidia.py b/swanlab/data/run/metadata/hardware/gpu/nvidia.py index fa9edcc2..d6a67c7f 100644 --- a/swanlab/data/run/metadata/hardware/gpu/nvidia.py +++ b/swanlab/data/run/metadata/hardware/gpu/nvidia.py @@ -68,19 +68,34 @@ class GpuCollector(HardwareCollector): def __init__(self, count: int): super().__init__() + # GPU 利用率 + self.gpu_util_key = generate_key("gpu.{idx}.ptc") + util_config = HardwareConfig(y_range=(0, 100), chart_name="GPU Utilization (%)", chart_index=random_index()) + # GPU 内存使用率 self.gpu_mem_pct_key = generate_key("gpu.{idx}.mem.ptc") - mem_pct_config = HardwareConfig(y_range=(0, 100), chart_name="GPU Utilization (%)", chart_index=random_index()) + mem_pct_config = HardwareConfig( + y_range=(0, 100), chart_name="GPU Memory Allocated (%)", chart_index=random_index() + ) + # GPU 温度 self.gpu_temp_key = generate_key("gpu.{idx}.temp") tem_config = HardwareConfig(chart_name="GPU Temperature (℃)", chart_index=random_index()) + # GPU 功耗 self.gpu_power_key = generate_key("gpu.{idx}.power") power_config = HardwareConfig(chart_name="GPU Power Usage (W)", chart_index=random_index()) - self.per_gpu_configs = {self.gpu_mem_pct_key: [], self.gpu_temp_key: [], self.gpu_power_key: []} + # 每个GPU的配置信息 + self.per_gpu_configs = { + self.gpu_mem_pct_key: [], + self.gpu_temp_key: [], + self.gpu_power_key: [], + self.gpu_util_key: [], + } self.handles = [] for idx in range(count): metric_name = "GPU {idx}".format(idx=idx) self.per_gpu_configs[self.gpu_mem_pct_key].append(mem_pct_config.clone(metric_name=metric_name)) self.per_gpu_configs[self.gpu_temp_key].append(tem_config.clone(metric_name=metric_name)) self.per_gpu_configs[self.gpu_power_key].append(power_config.clone(metric_name=metric_name)) + self.per_gpu_configs[self.gpu_util_key].append(util_config.clone(metric_name=metric_name)) def get_gpu_config(self, key: str, idx: int) -> HardwareConfig: """ @@ -88,6 +103,19 @@ def get_gpu_config(self, key: str, idx: int) -> HardwareConfig: """ return self.per_gpu_configs[key][idx] + def get_gpu_util(self, idx: int) -> HardwareInfo: + """ + 获取 GPU 利用率 + """ + handle = self.handles[idx] + util_info = pynvml.nvmlDeviceGetUtilizationRates(handle) + return { + "key": self.gpu_util_key.format(idx=idx), + "value": util_info.gpu, + "name": "GPU {idx} Utilization (%)".format(idx=idx), + "config": self.get_gpu_config(self.gpu_util_key, idx), + } + def get_gpu_mem_pct(self, idx: int) -> HardwareInfo: """ 获取 GPU 内存使用率 @@ -98,7 +126,7 @@ def get_gpu_mem_pct(self, idx: int) -> HardwareInfo: return { "key": self.gpu_mem_pct_key.format(idx=idx), "value": mem_pct, - "name": "GPU {idx} Utilization (%)".format(idx=idx), + "name": "GPU {idx} Memory Allocated (%)".format(idx=idx), "config": self.get_gpu_config(self.gpu_mem_pct_key, idx), } @@ -135,6 +163,7 @@ def collect(self) -> HardwareInfoList: """ result: HardwareInfoList = [] for idx, handle in enumerate(self.handles): + result.append(self.get_gpu_util(idx)) result.append(self.get_gpu_mem_pct(idx)) result.append(self.get_gpu_temp(idx)) result.append(self.get_gpu_power(idx)) diff --git a/swanlab/data/run/metadata/hardware/npu/ascend.py b/swanlab/data/run/metadata/hardware/npu/ascend.py index 515a47e4..04bb24e4 100644 --- a/swanlab/data/run/metadata/hardware/npu/ascend.py +++ b/swanlab/data/run/metadata/hardware/npu/ascend.py @@ -5,11 +5,12 @@ @description: 华为昇腾NPU信息采集 """ -import math import os import platform import subprocess +import math + from ..type import HardwareFuncResult, HardwareInfoList, HardwareConfig, HardwareInfo, HardwareCollector as H from ..utils import generate_key, random_index @@ -106,12 +107,20 @@ class AscendCollector(H): def __init__(self, npu_map): super().__init__() self.npu_map = npu_map - # HBM Usage Rate(%) + # NPU Utilization (%) + self.util_key = generate_key("npu.{npu_index}.ptc") + util_config = HardwareConfig( + y_range=(0, 100), + chart_index=random_index(), + chart_name="NPU Utilization (%)", + ) + self.per_util_configs = {} + # NPU Memory Allocated (%) self.hbm_rate_key = generate_key("npu.{npu_index}.mem.ptc") hbm_rate_config = HardwareConfig( y_range=(0, 100), chart_index=random_index(), - chart_name="NPU Utilization (%)", + chart_name="NPU Memory Allocated (%)", ) self.per_hbm_configs = {} # NPU Temperature (℃) @@ -126,6 +135,7 @@ def __init__(self, npu_map): for npu_id in npu_map: for chip_id in npu_map[npu_id]: metric_name = f"NPU {npu_id}-{chip_id}" + self.per_util_configs[metric_name] = util_config.clone(metric_name=metric_name) self.per_hbm_configs[metric_name] = hbm_rate_config.clone(metric_name=metric_name) self.per_temp_configs[metric_name] = temp_config.clone(metric_name=metric_name) @@ -133,35 +143,49 @@ def collect(self) -> HardwareInfoList: result: HardwareInfoList = [] for npu_id in self.npu_map: for chip_id in self.npu_map[npu_id]: - result.append(self.get_hbm_rate(npu_id, chip_id)) + result.extend(self.get_usage(npu_id, chip_id)) result.append(self.get_chip_temp(npu_id, chip_id)) return result - def get_hbm_rate(self, npu_id: str, chip_id: str) -> HardwareInfo: + def get_usage(self, npu_id: str, chip_id: str) -> HardwareInfoList: """ - 获取指定NPU设备的芯片HBM的用量信息 + 获取指定NPU设备的芯片HBM的用量信息和利用率 """ output = subprocess.run( ["npu-smi", "info", "-t", "usages", "-i", npu_id, "-c", chip_id], capture_output=True, text=True, ).stdout - rate = math.nan + # 格式化获取NPU ID和芯片ID + _id, metric_name = self.get_label(npu_id, chip_id) + # 获取信息 + util_info = { + "key": self.util_key.format(npu_index=_id), + "name": f"{metric_name} Utilization (%)", + "value": math.nan, + "config": self.per_util_configs[metric_name], + } + hbm_info = { + "key": self.hbm_rate_key.format(npu_index=_id), + "name": f"{metric_name} Memory Allocated (%)", + "value": math.nan, + "config": self.per_hbm_configs[metric_name], + } for line in output.split("\n"): + if "aicore usage rate" in line.lower(): + line = line.split(":") + # 利用率的值在最后一个 + util = line[-1].strip() + if util.isdigit(): + util_info['value'] = float(util) + if "hbm usage rate" in line.lower(): line = line.split(":") # HBM Capacity的值在最后一个 hbm = line[-1].strip() if hbm.isdigit(): - rate = float(hbm) - break - _id, metric_name = self.get_label(npu_id, chip_id) - return { - "key": self.hbm_rate_key.format(npu_index=_id), - "name": f"{metric_name} Utilization (%)", - "value": rate, - "config": self.per_hbm_configs[metric_name], - } + hbm_info['value'] = float(hbm) + return [util_info, hbm_info] def get_chip_temp(self, npu_id: str, chip_id: str) -> HardwareInfo: """ From 0819e4c848c33d10c0b983f88ee7bf87e97de6c7 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 22:46:52 +0800 Subject: [PATCH 20/21] =?UTF-8?q?Update=20=E7=A1=AC=E4=BB=B6=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E9=87=87=E9=9B=86.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...\266\344\277\241\346\201\257\351\207\207\351\233\206.md" | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" index 81d8ddee..9a2034e6 100644 --- "a/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" +++ "b/docs/\347\241\254\344\273\266\344\277\241\346\201\257\351\207\207\351\233\206.md" @@ -135,7 +135,11 @@ class MonitorCron: 根据Ascend NPU的[官方文档](https://support.huawei.com/enterprise/zh/doc/EDOC1100388864/8c5e18a7), 唯一定位一块计算芯片需要同时知道NPU ID和Chip ID,因此对于Ascend NPU而言,`npu_index = f{npu_id}-{chip_id}`。 -#### GPU Memory Allocated (%) +#### NPU Utilization (%) + +表示每个NPU的利用率百分比,swanlab为它打了一个 `npu.{npu_index}.pct` 标签。 + +#### NPU Memory Allocated (%) 表示每个NPU的HBM利用率百分比,swanlab为它打了一个 `npu.{npu_index}.mem.ptc` 标签。 From f2147c02f49146e8d610e9e332f5118b9a7dc704 Mon Sep 17 00:00:00 2001 From: KAAANG <79990647+SAKURA-CAT@users.noreply.github.com> Date: Fri, 13 Dec 2024 23:07:49 +0800 Subject: [PATCH 21/21] fix: npu failed --- swanlab/data/run/metadata/hardware/npu/ascend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/swanlab/data/run/metadata/hardware/npu/ascend.py b/swanlab/data/run/metadata/hardware/npu/ascend.py index 04bb24e4..552e335e 100644 --- a/swanlab/data/run/metadata/hardware/npu/ascend.py +++ b/swanlab/data/run/metadata/hardware/npu/ascend.py @@ -5,12 +5,11 @@ @description: 华为昇腾NPU信息采集 """ +import math import os import platform import subprocess -import math - from ..type import HardwareFuncResult, HardwareInfoList, HardwareConfig, HardwareInfo, HardwareCollector as H from ..utils import generate_key, random_index @@ -178,6 +177,7 @@ def get_usage(self, npu_id: str, chip_id: str) -> HardwareInfoList: util = line[-1].strip() if util.isdigit(): util_info['value'] = float(util) + continue if "hbm usage rate" in line.lower(): line = line.split(":") @@ -185,6 +185,7 @@ def get_usage(self, npu_id: str, chip_id: str) -> HardwareInfoList: hbm = line[-1].strip() if hbm.isdigit(): hbm_info['value'] = float(hbm) + continue return [util_info, hbm_info] def get_chip_temp(self, npu_id: str, chip_id: str) -> HardwareInfo: