|
30 | 30 | HCCN_TOOL_PATH = envs.HCCN_PATH |
31 | 31 |
|
32 | 32 |
|
# Row of `npu-smi info -m` output that describes an Ascend chip: three integer
# columns followed by a name starting with "Ascend". The first column is
# captured and used as the device ID passed to hccn_tool. Rows that do not
# match (the original comment mentions "Mcu" rows) are ignored.
_ASCEND_ROW_RE = re.compile(r'^\s*(\d+)\s+\d+\s+\d+\s+Ascend')
# `ipaddr:<value>` entry in the output of `hccn_tool -i <id> -ip -g`.
_IPADDR_RE = re.compile(r'ipaddr:(.*)')


def _run_tool(cmd):
    """Run *cmd* (an argv list, no shell) and capture stdout/stderr as text."""
    return subprocess.run(cmd,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          universal_newlines=True)


def get_device_ips():
    """Return the device IP of every Ascend NPU reported by ``npu-smi``.

    The device IDs are taken from the Ascend rows of ``npu-smi info -m``;
    for each ID, ``hccn_tool -i <id> -ip -g`` is run and the value after
    ``ipaddr:`` is collected.

    Returns:
        A list of IP strings, one per detected device, in the order the
        devices appear in the ``npu-smi`` output.

    Raises:
        RuntimeError: If ``npu-smi`` cannot be launched or exits with a
            non-zero status, if ``HCCN_TOOL_PATH`` does not exist, if no
            device ID can be parsed from the ``npu-smi`` output, or if no
            ``ipaddr:`` entry is found in the ``hccn_tool`` output for a
            device.
    """
    try:
        npu_info = _run_tool(['npu-smi', 'info', '-m'])
    except OSError as err:
        # A missing or non-executable npu-smi makes subprocess.run raise
        # before any return code exists; report it like any other
        # "tool not available" condition instead of leaking the OSError.
        raise RuntimeError(
            "No npu-smi/hccn_tool tools provided for NPU.") from err
    if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH):
        raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.")

    # Keep only the rows describing Ascend chips; their first column is the ID.
    device_ids = [
        int(row.group(1))
        for row in map(_ASCEND_ROW_RE.match, npu_info.stdout.splitlines())
        if row
    ]
    if not device_ids:
        raise RuntimeError(
            "Cannot parse any valid device ID from npu-smi output.")

    device_ip_list = []
    for device_id in device_ids:
        device_ip_info = _run_tool(
            [HCCN_TOOL_PATH, '-i', str(device_id), '-ip', '-g'])
        ip_match = _IPADDR_RE.search(device_ip_info.stdout)
        if not ip_match:
            raise RuntimeError(
                f"Cannot parse IP from hccn_tool for device {device_id}")
        device_ip_list.append(ip_match.group(1).strip())

    return device_ip_list
64 | 67 |
|
65 | 68 |
|
# Query the local Ascend devices and print the list of their IP addresses.
_device_ips = get_device_ips()
print(_device_ips)
0 commit comments