[Bugfix] set OMP_NUM_THREADS to 1 by default for multiprocessing (#6109)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com>
vllm-project · Jul 3, 2024 · 1dab9bc · 1dab9bc
1 parent 3de6e6a
commit 1dab9bc
Showing 1 changed file with 5 additions and 0 deletions.
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
@@ -37,6 +37,11 @@ def _init_executor(self) -> None:
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
+        # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
+        # contention amongst the shards
+        if "OMP_NUM_THREADS" not in os.environ:
+            os.environ["OMP_NUM_THREADS"] = "1"
+
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")