
Commit c62fd91

Merge branch 'main' into main

2 parents e82e946 + 6bc82cf

12 files changed (+81 / −30 lines)


.github/workflows/image_310p_openeuler.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_310p_openeuler.yml'
       - 'Dockerfile.310p.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```
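The five image workflow diffs that follow apply the identical change: the same six build-system paths (setup.py, pyproject.toml, requirements.txt, cmake/**, CMakeLists.txt, csrc/**) are appended to each workflow's trigger path filter, so the images are also rebuilt when packaging metadata, CMake files, or C++ sources change, not only when vllm_ascend/** or the corresponding Dockerfile does.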

.github/workflows/image_310p_ubuntu.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_310p_ubuntu.yml'
      - 'Dockerfile.310p'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_a3_openeuler.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_a3_openeuler.yml'
       - 'Dockerfile.a3.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_a3_ubuntu.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_a3_ubuntu.yml'
       - 'Dockerfile.a3'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_openeuler.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -35,6 +35,12 @@ on:
       - '.github/workflows/image_openeuler.yml'
       - 'Dockerfile.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_ubuntu.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_ubuntu.yml'
       - 'Dockerfile'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

tests/ut/models/test_deepseek_v2.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -122,7 +122,8 @@ def mock_distributed():
 @pytest.fixture
 def mock_forward_context():
     forward_context = Mock(in_profile_run=False, with_prefill=False)
-    with patch("vllm_ascend.models.deepseek_v2.get_forward_context", return_value=forward_context):
+    with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
+               return_value=forward_context):
         yield
 
 
@@ -213,7 +214,8 @@ def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
                               quant_config=None)
 
 
-def test_custom_deepseek_v2_moe(mock_distributed, base_config, mock_forward_context):
+def test_custom_deepseek_v2_moe(mock_distributed, base_config,
+                                mock_forward_context):
     base_config.n_shared_experts = 1
     moe = CustomDeepseekV2MoE(config=base_config,
                               quant_config=None,
```
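Both hunks only re-wrap over-long lines (a `patch(...)` call and a `def` signature each split across two lines), presumably to satisfy the project's formatter; test behavior is unchanged.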

tests/ut/ops/test_fused_ops.py

Lines changed: 11 additions & 6 deletions

```diff
@@ -20,10 +20,10 @@
 import torch.nn as nn
 from pytest_mock import MockerFixture
 
+from vllm_ascend.ascend_forward_context import get_fused_moe_state
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
                                        AscendUnquantizedFusedMoEMethod)
-from vllm_ascend.utils import adapt_patch, AscendSocVersion  # noqa E402
-from vllm_ascend.ascend_forward_context import get_fused_moe_state
+from vllm_ascend.utils import AscendSocVersion, adapt_patch  # noqa E402
 
 adapt_patch(True)
 
@@ -245,8 +245,11 @@ def test_forward(self, mock_dist_env, default_moe_config, others_param):
         moe.moe_parallel_config.ep_size = 1
 
         moe.quant_method = MockQuantMethod(shared_experts, num_tokens)
-        forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens, dtype=torch.bool), padded_num_tokens=num_tokens)
-        with patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context):
+        forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens,
+                                                         dtype=torch.bool),
+                                    padded_num_tokens=num_tokens)
+        with patch("vllm_ascend.ops.fused_moe.get_forward_context",
+                   return_value=forward_context):
             output = moe.forward(inputs,
                                  router_logits,
                                  is_prefill=is_prefill,
@@ -306,7 +309,8 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
         global_num_experts, ep_size, select_softmax = others_param
         is_prefill = False
         is_deepseek_v3_r1 = global_num_experts == 256
-        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(ep_size, is_prefill, is_deepseek_v3_r1))
+        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(
+            ep_size, is_prefill, is_deepseek_v3_r1))
         with patch(
                 "vllm_ascend.ops.fused_moe.SELECT_GATING_TOPK_SOTFMAX_EXPERTS",
                 select_softmax), \
@@ -342,7 +346,8 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
         """
         ep_size, alltoall_buffer = others_param
         is_prefill = False
-        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
+        forward_context = MagicMock(
+            fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
         with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
                    alltoall_buffer), \
             patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
```

vllm_ascend/attention/attention_v1.py

Lines changed: 27 additions & 19 deletions

```diff
@@ -121,28 +121,36 @@ class AscendAttentionState(Enum):
 
 @dataclass
 class AscendMetadata:
-    num_actual_tokens: int  # Number of tokens excluding padding.
-    # (batch_size, max_blocks_per_seq).
-    # Block addresses per sequence. (Seq id -> list of physical block)
-    block_tables: torch.Tensor
-    # (batch_size,). The sequence length per sequence. Sequence length means
-    # the computed tokens + new tokens None if it is a decoding.
-    query_start_loc: torch.Tensor
-    query_lens: torch.Tensor
-    seq_lens: torch.Tensor
-    # Maximum query length in the batch. None for decoding.
-    max_query_len: Optional[int] = None
-    # (num_tokens,). The indices of the token slots that input tokens will be
-    # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
-    # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
-    # in block 0, and 1st slot in block 1, respectively.
-    slot_mapping: torch.Tensor = None
+
+    # **************************** Basic Properties ****************************
+    attn_mask: Optional[torch.Tensor] = None
     # Current state of this attention run.
     attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill
-    attn_mask: Optional[torch.Tensor] = None
 
-    # For logging.
-    num_input_tokens: int = 0  # Number of tokens including padding.
+    # Number of tokens excluding padding.
+    num_actual_tokens: int = 0
+
+    # The sequence length per sequence. Sequence length means the computed
+    # tokens + new tokens (is None if it is a decoding).
+    # (batch_size,)
+    seq_lens: torch.Tensor = None
+
+    query_start_loc: torch.Tensor = None
+    query_lens: torch.Tensor = None
+    # Maximum query length in the batch (None for decoding).
+    max_query_len: Optional[int] = None
+
+    # ********************** KV Cache Related Properties ***********************
+    # Block addresses per sequence (Seq id -> list of physical block).
+    # (batch_size, max_blocks_per_seq)
+    block_tables: torch.Tensor = None
+
+    # The indices of the token slots that input tokens will be stored into.
+    # E.g., if `slot_mapping` is [35, 2, 17] and the block size is 16, the
+    # three tokens are stored in the 3rd slot in block 2, 2nd slot in block 0,
+    # and 1st slot in block 1, respectively.
+    # (num_tokens,)
+    slot_mapping: torch.Tensor = None
 
 
 class AscendAttentionMetadataBuilder:
```
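The restored `slot_mapping` comment can be verified with a few lines of arithmetic: a flat slot index decomposes into a physical block index and an in-block offset via integer division and modulo. A small illustrative sketch (the `decode_slots` helper is ours, not part of the module):

```python
import torch


def decode_slots(slot_mapping: torch.Tensor, block_size: int):
    # A flat slot index s lives at offset s % block_size within
    # physical block s // block_size.
    return slot_mapping // block_size, slot_mapping % block_size


blocks, offsets = decode_slots(torch.tensor([35, 2, 17]), block_size=16)
print(blocks.tolist())   # [2, 0, 1]
print(offsets.tolist())  # [3, 2, 1]
```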

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -107,7 +107,7 @@ def forward(
             for x in (q, k, v)
         ]
 
-        context_layer = torch.torch.empty_like(q)
+        context_layer = torch.empty_like(q)
 
         # operator requires pta version >= 2.5.1
         torch_npu._npu_flash_attention_unpad(
```
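This was a latent typo rather than a crash: `torch.torch` happens to resolve back to the `torch` module itself, so `torch.torch.empty_like(q)` behaved identically to `torch.empty_like(q)`; the fix simply drops the accidental double qualification.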
