Commit 490ac86

Add TP=2 test (untested)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
1 parent c6d6c3b commit 490ac86

1 file changed: +56 -1 lines changed


tests/compile/test_full_graph.py

Lines changed: 56 additions & 1 deletion
@@ -18,7 +18,7 @@
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer
 
-from ..utils import create_new_process_for_each_test
+from ..utils import create_new_process_for_each_test, multi_gpu_test
 
 
 def models_list(*, all: bool = True, keywords: list[str] | None = None):
@@ -237,6 +237,61 @@ def test_default_fusion(
     assert "Fused quant onto 48 attention nodes" in caplog_vllm.text, caplog_vllm.text
 
 
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("custom_ops", ["+quant_fp8", "-quant_fp8"])
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_default_fusion_tp2(
+    custom_ops: str, inductor_graph_partition: bool, caplog_vllm, monkeypatch
+):
+    model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+    model_kwargs = {"kv_cache_dtype": "fp8", "max_model_len": 1024}
+    backend = _Backend.FLASHINFER
+
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
+
+    if inductor_graph_partition:
+        mode = CUDAGraphMode.FULL_AND_PIECEWISE
+        splitting_ops: Optional[list[str]] = None
+    else:
+        mode = CUDAGraphMode.FULL_DECODE_ONLY
+        splitting_ops = []
+
+    # Disable compile cache to make sure custom passes run.
+    # Otherwise, we can't verify fusion happened through the logs.
+    # Log capture also doesn't work with multiprocessing yet.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    model_kwargs["tensor_parallel_size"] = 2
+    compilation_config = CompilationConfig(
+        # Testing properties
+        use_inductor_graph_partition=inductor_graph_partition,
+        cudagraph_mode=mode,
+        custom_ops=custom_ops_list,
+        splitting_ops=splitting_ops,
+        # Common
+        level=CompilationLevel.PIECEWISE,
+        pass_config=PassConfig(
+            enable_attn_fusion=True,
+            enable_noop=True,
+            enable_fi_allreduce_fusion=True,
+        ),
+        # Inductor caches custom passes by default as well via uuid
+        inductor_compile_config={"force_disable_caches": True},
+    )
+
+    with (
+        caplog_vllm.at_level(logging.DEBUG),
+        global_force_attn_backend_context_manager(backend),
+    ):
+        run_model(compilation_config, model, model_kwargs)
+
+    assert "Fused quant onto 48 attention nodes" in caplog_vllm.text, caplog_vllm.text
+
+    # TODO fill in correct number
+    assert "Replaced 5 patterns" in caplog_vllm.text, caplog_vllm.text
+
+
 def run_model(
     compile_config: Union[int, CompilationConfig],
     model: str,
