17 | 17 |
18 | 18 | from dataclasses import dataclass
19 | 19 | from enum import Enum
20 | | -from typing import List, Optional, Tuple, Type, ClassVar
| 20 | +from typing import ClassVar, List, Optional, Tuple, Type
21 | 21 |
22 | 22 | import torch |
23 | 23 | import torch.nn as nn |
32 | 32 | from vllm.forward_context import ForwardContext, get_forward_context |
33 | 33 | from vllm.utils import cdiv, direct_register_custom_op |
34 | 34 | from vllm.v1.core.sched.output import SchedulerOutput |
| 35 | +from vllm.v1.kv_cache_interface import AttentionSpec |
35 | 36 |
36 | 37 | from vllm_ascend.attention.utils import AscendCommonAttentionMetadata |
37 | 38 | from vllm_ascend.ops.attention import vanilla_chunked_prefill |
38 | 39 | from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, |
39 | 40 | nd_to_nz_2d, nd_to_nz_spec) |
40 | | -from vllm_ascend.worker.npu_input_batch import InputBatch |
41 | | -from vllm.v1.kv_cache_interface import AttentionSpec |
42 | 41 |
43 | 42 |
44 | 43 | def wait_for_kv_layer_from_connector(layer_name: str): |
@@ -582,7 +581,7 @@ def unified_ascend_attention_with_output( |
582 | 581 | attn_metadata = attn_metadata[layer_name] |
583 | 582 | self = forward_context.no_compile_layers[layer_name] |
584 | 583 | kv_cache = self.kv_cache[forward_context.virtual_engine] |
585 | | - print(100*"^", f"layer_name: {layer_name}")
| 584 | + print(100 * "^", f"layer_name: {layer_name}")
586 | 585 | self.impl.forward(self, |
587 | 586 | query, |
588 | 587 | key, |