|
18 | 18 | from vllm.platforms import current_platform |
19 | 19 | from vllm.utils import is_torch_equal_or_newer |
20 | 20 |
|
21 | | -from ..utils import create_new_process_for_each_test |
| 21 | +from ..utils import create_new_process_for_each_test, multi_gpu_test |
22 | 22 |
|
23 | 23 |
|
24 | 24 | def models_list(*, all: bool = True, keywords: list[str] | None = None): |
@@ -237,6 +237,61 @@ def test_default_fusion( |
237 | 237 | assert "Fused quant onto 48 attention nodes" in caplog_vllm.text, caplog_vllm.text |
238 | 238 |
|
239 | 239 |
|
| 240 | +@multi_gpu_test(num_gpus=2) |
| 241 | +@pytest.mark.parametrize("custom_ops", ["+quant_fp8", "-quant_fp8"]) |
| 242 | +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) |
| 243 | +def test_default_fusion_tp2( |
| 244 | + custom_ops: str, inductor_graph_partition: bool, caplog_vllm, monkeypatch |
| 245 | +): |
| 246 | + model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" |
| 247 | + model_kwargs = {"kv_cache_dtype": "fp8", "max_model_len": 1024} |
| 248 | + backend = _Backend.FLASHINFER |
| 249 | + |
| 250 | + custom_ops_list = custom_ops.split(",") if custom_ops else [] |
| 251 | + |
| 252 | + if inductor_graph_partition: |
| 253 | + mode = CUDAGraphMode.FULL_AND_PIECEWISE |
| 254 | +        splitting_ops: list[str] | None = None |
| 255 | + else: |
| 256 | + mode = CUDAGraphMode.FULL_DECODE_ONLY |
| 257 | + splitting_ops = [] |
| 258 | + |
| 259 | +    # Disable the compile cache to make sure custom passes run. |
| 260 | + # Otherwise, we can't verify fusion happened through the logs. |
| 261 | + # Log capture also doesn't work with multiprocessing yet. |
| 262 | + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") |
| 263 | + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") |
| 264 | + |
| 265 | + model_kwargs["tensor_parallel_size"] = 2 |
| 266 | + compilation_config = CompilationConfig( |
| 267 | + # Testing properties |
| 268 | + use_inductor_graph_partition=inductor_graph_partition, |
| 269 | + cudagraph_mode=mode, |
| 270 | + custom_ops=custom_ops_list, |
| 271 | + splitting_ops=splitting_ops, |
| 272 | + # Common |
| 273 | + level=CompilationLevel.PIECEWISE, |
| 274 | + pass_config=PassConfig( |
| 275 | + enable_attn_fusion=True, |
| 276 | + enable_noop=True, |
| 277 | + enable_fi_allreduce_fusion=True, |
| 278 | + ), |
| 279 | +        # Inductor also caches custom passes by default, keyed by uuid |
| 280 | + inductor_compile_config={"force_disable_caches": True}, |
| 281 | + ) |
| 282 | + |
| 283 | + with ( |
| 284 | + caplog_vllm.at_level(logging.DEBUG), |
| 285 | + global_force_attn_backend_context_manager(backend), |
| 286 | + ): |
| 287 | + run_model(compilation_config, model, model_kwargs) |
| 288 | + |
| 289 | + assert "Fused quant onto 48 attention nodes" in caplog_vllm.text, caplog_vllm.text |
| 290 | + |
| 291 | +    # TODO: fill in the correct number of replaced patterns |
| 292 | + assert "Replaced 5 patterns" in caplog_vllm.text, caplog_vllm.text |
| 293 | + |
| 294 | + |
240 | 295 | def run_model( |
241 | 296 | compile_config: Union[int, CompilationConfig], |
242 | 297 | model: str, |
|
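A minimal sketch of what a `run_model` helper with the signature shown above might do, assuming it simply builds an `LLM` from the given compilation config and model kwargs and runs a short generation so the compilation passes execute and emit the logs the test asserts on. The actual helper in this file may differ; the prompt, sampling parameters, and the `run_model_sketch` name are illustrative only.

```python
from typing import Union

from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig


def run_model_sketch(
    compile_config: Union[int, CompilationConfig],
    model: str,
    model_kwargs: dict,
) -> None:
    # Build the engine with the compilation config under test; extra kwargs
    # (kv_cache_dtype, max_model_len, tensor_parallel_size, ...) pass through.
    llm = LLM(
        model=model,
        compilation_config=compile_config,
        **model_kwargs,
    )
    # A short greedy generation is enough to trigger compilation, so the
    # fusion passes run and their log lines can be checked by the test.
    outputs = llm.generate(
        ["Hello, my name is"],
        SamplingParams(temperature=0.0, max_tokens=8),
    )
    assert outputs and outputs[0].outputs[0].text is not None
```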