|
11 | 11 | from vllm.distributed.parallel_state import get_pp_group, get_tp_group |
12 | 12 | from vllm.logger import init_logger |
13 | 13 | from vllm.model_executor.utils import set_random_seed |
14 | | -from vllm.platforms import current_platform |
| 14 | +from vllm.platforms import CpuArchEnum, current_platform |
15 | 15 | from vllm.sequence import IntermediateTensors |
16 | 16 | from vllm.v1.core.sched.output import SchedulerOutput |
17 | 17 | from vllm.v1.outputs import ModelRunnerOutput |
@@ -43,8 +43,12 @@ def init_device(self): |
43 | 43 |         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND |
44 | 44 |         self.local_omp_cpuid = "all" |
45 | 45 |         if omp_cpuids == "auto": |
46 | | -            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( |
47 | | -            ) |
| 46 | +            if current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC: |
| 47 | +                self.local_omp_cpuid = ( |
| 48 | +                    self.get_cpus_id_binding_based_on_numa_nodes_ppc64le()) |
| 49 | +            else: |
| 50 | +                self.local_omp_cpuid = ( |
| 51 | +                    self.get_cpus_id_binding_based_on_numa_nodes()) |
48 | 52 |         else: |
49 | 53 |             self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] |
50 | 54 |
|
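For context on the new branch above: when `VLLM_CPU_OMP_THREADS_BIND` is left at `"auto"`, the worker now picks the NUMA-based helper matching the CPU architecture; any other value is treated as a "|"-separated list of per-rank binding strings, as the unchanged `else` branch shows. A minimal sketch of that resolution logic (the helper name `resolve_omp_cpuid` and the example binding value are illustrative, not part of the diff):

```python
# Sketch only: mirrors the dispatch in init_device above.
def resolve_omp_cpuid(omp_cpuids: str, rank: int, is_powerpc: bool) -> str:
    if omp_cpuids == "auto":
        # init_device calls the ppc64le-specific NUMA helper on POWERPC,
        # and the generic NUMA helper on other architectures.
        return ("<numa-binding-ppc64le>" if is_powerpc
                else "<numa-binding-generic>")
    # Manual binding: one entry per rank, separated by "|",
    # e.g. VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" for two ranks.
    return omp_cpuids.split("|")[rank]

print(resolve_omp_cpuid("0-31|32-63", rank=1, is_powerpc=False))  # -> 32-63
```
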
@@ -153,3 +157,57 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str: |
153 | 157 | "fallback to no thread-binding. To get better performance," |
154 | 158 | "please try to manually bind threads.") |
155 | 159 | return rank_to_cpus |
| 160 | + |
| 161 | + def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: |
| 162 | + """ |
| 163 | + Power (ppc64le) specific: Selects a subset of threads per core for |
| 164 | + each NUMA node.This is robust to SMT mode (SMT-8, SMT-4, etc) |
| 165 | + because the OS only exposes available threads.This maximizes |
| 166 | + performance by avoiding oversubscription of logical CPUs on Power. |
| 167 | + """ |
| 168 | + |
| 169 | + def select_threads_per_power_core(node_cpu_ids): |
| 170 | + return [cpu for cpu in node_cpu_ids if cpu % 8 < 4] |
| 171 | + |
| 172 | + rank_to_cpus = self.local_omp_cpuid |
| 173 | + world_size = self.vllm_config.parallel_config.world_size |
| 174 | + libnuma_found = util.find_spec("numa") is not None |
| 175 | + psutil_found = util.find_spec("psutil") is not None |
| 176 | + if libnuma_found and psutil_found: |
| 177 | + import psutil |
| 178 | + from numa import info |
| 179 | + cpus_allow_list = psutil.Process().cpu_affinity() |
| 180 | + numa_size = info.get_num_configured_nodes() |
| 181 | + |
| 182 | + node_to_cpus = [] |
| 183 | + for i in range(numa_size): |
| 184 | + node_intersect = set( |
| 185 | + info.node_to_cpus(i)).intersection(cpus_allow_list) |
| 186 | + if bool(node_intersect): |
| 187 | + node_to_cpus.append(sorted(list(node_intersect))) |
| 188 | + |
| 189 | + if world_size > len(node_to_cpus): |
| 190 | + logger.error( |
| 191 | + "Auto thread-binding failed due to " |
| 192 | + "world size: %d is larger than " |
| 193 | + "allowed NUMA nodes number: %d." |
| 194 | + "Please try to bind threads manually.", world_size, |
| 195 | + len(node_to_cpus)) |
| 196 | + else: |
| 197 | + node_cpus_this_rank = node_to_cpus[self.rank] |
| 198 | + node_cpus_this_rank = select_threads_per_power_core( |
| 199 | + node_cpus_this_rank) |
| 200 | + cpu_count_per_numa = len(node_cpus_this_rank) |
| 201 | + num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, |
| 202 | + cpu_count_per_numa // 2) |
| 203 | + end = cpu_count_per_numa - num_of_reserved_cpu |
| 204 | + rank_to_cpus_list = node_cpus_this_rank[:end] |
| 205 | + rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) |
| 206 | + logger.info("ppc64le thread-binding list: %s", rank_to_cpus) |
| 207 | + else: |
| 208 | + logger.warning( |
| 209 | + "Auto thread-binding is not supported due to " |
| 210 | + "the lack of package numa and psutil," |
| 211 | + "fallback to no thread-binding. To get better performance," |
| 212 | + "please try to manually bind threads.") |
| 213 | + return rank_to_cpus |
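To make the selection rule concrete, here is a small worked example of `select_threads_per_power_core`, assuming the usual ppc64le numbering where the logical CPUs of one physical core are contiguous in groups of eight (the two-core node layout below is made up for illustration):

```python
# Illustration of the per-core SMT thread selection used above.
def select_threads_per_power_core(node_cpu_ids):
    # Keep only the first 4 SMT threads of each physical core.
    return [cpu for cpu in node_cpu_ids if cpu % 8 < 4]

node_cpu_ids = list(range(16))  # two SMT-8 cores: CPUs 0-7 and 8-15
print(select_threads_per_power_core(node_cpu_ids))
# -> [0, 1, 2, 3, 8, 9, 10, 11]
```

After this selection, `VLLM_CPU_NUM_OF_RESERVED_CPU` trims the tail of the list (capped at half the node's CPUs) before the remaining IDs are joined into the final binding string.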