Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions tests/e2e/multicard/test_external_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,14 @@
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch

import pytest
import torch_npu

# Model identifiers fed to the ``model`` parametrization of the tests below.
MODELS = ["Qwen/Qwen3-0.6B"]
# Model identifiers for MoE test cases (presumably used by tests outside this
# view -- confirm before removing).
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
# First 10 characters of the name reported for device 0. This is evaluated at
# import time, so a device query happens as soon as the module is loaded.
# "Ascend910B" is exactly 10 characters long, so the slice lets the skip
# condition on test_mm_allreduce compare against that string with plain
# equality even if the full name is longer (assumption: full names may carry
# a suffix -- verify against real device names).
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]


@pytest.mark.parametrize("model", MODELS)
Expand Down Expand Up @@ -147,3 +150,38 @@ def test_external_launcher_and_sleepmode():
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0


@pytest.mark.skipif(
    DEVICE_NAME != "Ascend910B",
    reason="This test is only for Ascend910B devices.",
)
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1"})
def test_mm_allreduce(model):
    """Run the offline external-launcher example with matmul all-reduce on.

    ``patch.dict`` sets ``VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE=1`` in
    ``os.environ`` for the duration of the test; the child process receives
    a copy of that environment. The test passes when the example prints a
    "Generated text:" line and exits with status 0.

    Args:
        model: Model identifier forwarded to the example via ``--model``.
    """
    # The example script lives under "examples", four directory levels above
    # this test file.
    repo_root = Path(__file__).parent.parent.parent.parent
    launcher = repo_root.joinpath("examples", "offline_external_launcher.py")

    # Snapshot the environment while the patch is active so the child
    # inherits the flag.
    child_env = os.environ.copy()

    argv = [sys.executable, str(launcher)]
    argv += ["--model", model]
    argv.append("--trust-remote-code")

    print("Running subprocess: " + " ".join(argv))

    # stderr is folded into stdout so a single decoded stream holds all of
    # the child's output; the run is bounded to 10 minutes.
    result = subprocess.run(
        argv,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )

    captured = result.stdout.decode()
    print(captured)

    assert "Generated text:" in captured
    assert result.returncode == 0
2 changes: 2 additions & 0 deletions vllm_ascend/patch/worker/patch_common/patch_linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from vllm.distributed import (get_tensor_model_parallel_rank,
split_tensor_along_last_dim)
from vllm.distributed.parallel_state import get_tp_group
from vllm.logger import logger
from vllm.model_executor.layers.linear import RowParallelLinear

from vllm_ascend import envs
Expand Down Expand Up @@ -142,4 +143,5 @@ def calc_output(self, input_parallel: torch.Tensor) -> torch.Tensor:


# Import-time, opt-in monkey patch: when the
# VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE setting is truthy, announce it in the
# log and rebind the RowParallelLinear attribute of
# vllm.model_executor.layers.linear to AscendRowParallelLinear.
# NOTE(review): only lookups made through that module attribute after this
# point see the replacement; names already bound with
# ``from ... import RowParallelLinear`` keep the original class -- confirm the
# import order of this patch module is early enough for the intended callers.
if envs.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE:
    logger.info("AscendRowParallelLinear: Matmul all-reduce is enabled. ")
    vllm.model_executor.layers.linear.RowParallelLinear = AscendRowParallelLinear
Loading