Commit 8598890

[XPU] Use spawn with XPU multiprocessing
It's required to use the `spawn` start method when running the XPU backend with multiprocessing. There are two places in vllm where this needs to be fixed:

* One in `vllm/utils`
* Another in `tests/utils`

The fix in the tests adjusts the `create_new_process_for_each_test` decorator, which further needs to be applied to the actual tests. Some tests are already marked with it due to work done for ROCm; in other cases it might still be missing, or `fork_new_process_for_each_test` is used instead.

This commit unlocks running a number of tests on XPU and allows looking into actual runtime issues. The commit's behavior can be tried on these tests:

* `tests/v1/engine/test_llm_engine.py::test_engine_metrics`
* `tests/v1/e2e/test_cascade_attention.py`

Error happening before the fix:

```
RuntimeError: Cannot re-initialize XPU in forked subprocess. To use XPU with multiprocessing, you must use the 'spawn' start method
```

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
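The error above comes from inheriting an already-initialized accelerator runtime in a `fork()`ed child. The fix relies on creating each test's process with an explicit start method. As a rough, hypothetical sketch of that pattern in plain Python (not the actual vllm decorator, which lives in `tests/utils.py`):

```python
import multiprocessing


def run_in_new_process(method: str = "spawn"):
    """Decorator factory: run the wrapped function in a fresh process.

    A simplified, hypothetical sketch of the pattern this commit relies on.
    """
    assert method in ("spawn", "fork"), "Method must be either 'spawn' or 'fork'"

    def decorator(fn):
        def wrapper(*args, **kwargs):
            # get_context() picks the start method per call site without
            # touching the global multiprocessing default.
            ctx = multiprocessing.get_context(method)
            proc = ctx.Process(target=fn, args=args, kwargs=kwargs)
            proc.start()
            proc.join()
            # Surface child failures in the parent, as a test runner would.
            assert proc.exitcode == 0, f"child exited with {proc.exitcode}"
        return wrapper

    return decorator
```

With `method="spawn"` the child starts from a fresh interpreter, so an accelerator runtime initialized in the parent is never inherited; with `"fork"` the child shares the parent's (possibly already-initialized) XPU/CUDA state, which is what triggers the `RuntimeError` above.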
1 parent d8ee5a2 commit 8598890

File tree

3 files changed: +15 additions, −5 deletions


tests/utils.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -818,14 +818,15 @@ def create_new_process_for_each_test(
 
     Args:
         method: The process creation method. Can be either "spawn" or "fork".
-            If not specified,
-            it defaults to "spawn" on ROCm platforms and "fork" otherwise.
+            If not specified, it defaults to "spawn" on ROCm and XPU
+            platforms and "fork" otherwise.
 
     Returns:
         A decorator to run test functions in separate processes.
     """
     if method is None:
-        method = "spawn" if current_platform.is_rocm() else "fork"
+        use_spawn = current_platform.is_rocm() or current_platform.is_xpu()
+        method = "spawn" if use_spawn else "fork"
 
     assert method in ["spawn",
                       "fork"], "Method must be either 'spawn' or 'fork'"
```

tests/v1/e2e/test_cascade_attention.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -5,10 +5,10 @@
 
 from vllm import LLM, SamplingParams
 
-from ...utils import fork_new_process_for_each_test
+from ...utils import create_new_process_for_each_test
 
 
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 @pytest.mark.parametrize("attn_backend",
                          ["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
 def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
```

vllm/utils/__init__.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -1535,6 +1535,13 @@ def cuda_is_initialized() -> bool:
     return torch.cuda.is_initialized()
 
 
+def xpu_is_initialized() -> bool:
+    """Check if XPU is initialized."""
+    if not torch.xpu._is_compiled():
+        return False
+    return torch.xpu.is_initialized()
+
+
 def cuda_get_device_properties(device,
                                names: Sequence[str],
                                init_cuda=False) -> tuple[Any, ...]:
@@ -2848,6 +2855,8 @@ def _maybe_force_spawn():
     reason = None
     if cuda_is_initialized():
         reason = "CUDA is initialized"
+    elif xpu_is_initialized():
+        reason = "XPU is initialized"
     elif is_in_ray_actor():
         # even if we choose to spawn, we need to pass the ray address
         # to the subprocess so that it knows how to connect to the ray cluster.
```
