23 | 23 | from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, |
24 | 24 | AttentionLayer, AttentionType) |
25 | 25 | from vllm.attention.backends.utils import CommonAttentionState |
26 | | - |
| 26 | +from vllm.utils import direct_register_custom_op |
27 | 27 |
28 | 28 | class AscendAttentionBackend(AttentionBackend): |
| 29 | + accept_output_buffer: bool = True |
29 | 30 |
30 | 31 | @staticmethod |
31 | 32 | def get_name() -> str: |
@@ -167,59 +168,134 @@ def forward( |
167 | 168 | shape = [batch_size * seq_len, num_heads, head_size] |
168 | 169 | """ |
169 | 170 | num_tokens = query.shape[0] |
170 | | - output = torch.empty(num_tokens, |
| 171 | + if output is None: |
| 172 | + output = torch.empty(num_tokens, |
171 | 173 | self.num_heads, |
172 | 174 | self.head_size, |
173 | 175 | dtype=query.dtype, |
174 | 176 | device=query.device) |
| 177 | + torch.ops.vllm.unified_ascend_attention_with_output( |
| 178 | + layer=layer, |
| 179 | + query=query, |
| 180 | + key=key, |
| 181 | + value=value, |
| 182 | + kv_cache=kv_cache, |
| 183 | + attn_metadata=attn_metadata, |
| 184 | + output=output, |
| 185 | + self_num_heads=self.num_heads, |
| 186 | + self_head_size=self.head_size, |
| 187 | + self_scale=self.scale, |
| 188 | + self_num_kv_heads=self.num_kv_heads, |
| 189 | + self_hidden_size=self.hidden_size, |
| 190 | + self_kv_cache_dtype=self.kv_cache_dtype, |
| 191 | + self_sliding_window=self.sliding_window, |
| 192 | + self_alibi_slopes=self.alibi_slopes, |
| 193 | + self_attn_type=self.attn_type, |
| 194 | + self_num_queries_per_kv=self.num_queries_per_kv, |
| 195 | + self_seq_len_cpu_tensor=self.seq_len_cpu_tensor, |
| 196 | + ) |
175 | 197 |
176 | | - if attn_metadata is None: |
177 | | - # Profiling run. |
178 | | - return output.view(num_tokens, self.hidden_size) |
179 | | - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 |
180 | | - attn_type = self.attn_type |
181 | | - if attn_type != AttentionType.DECODER: |
182 | | - raise NotImplementedError("Encoder self-attention and " |
183 | | - "encoder/decoder cross-attention " |
184 | | - "are not implemented for " |
185 | | - "PallasAttentionBackendImpl") |
186 | | - # View q k v to BSH. |
187 | | - query = query.view(-1, self.num_heads, self.head_size) |
188 | | - key = key.view(-1, self.num_kv_heads, self.head_size) |
189 | | - value = value.view(-1, self.num_kv_heads, self.head_size) |
190 | | - # TODO: Remove this contiguous in the future. |
191 | | - value = value.contiguous() |
192 | | - |
193 | | - if hasattr(layer, 'quant_method'): |
194 | | - # TODO: Add attr (num_prefills, prefill_metadata, decode_metadata) to AscendMetadata |
195 | | - pass |
196 | | - else: |
197 | | - if kv_cache.numel() > 0: |
198 | | - key_cache, value_cache = kv_cache[0], kv_cache[1] |
199 | | - num_blocks, block_size, _ = key_cache.shape |
200 | | - key_cache = key_cache.view(num_blocks, block_size, |
201 | | - self.num_kv_heads, self.head_size) |
202 | | - value_cache = value_cache.view(num_blocks, block_size, |
203 | | - self.num_kv_heads, |
204 | | - self.head_size) |
205 | | - slots = attn_metadata.slot_mapping |
206 | | - torch_npu._npu_reshape_and_cache(key=key, |
207 | | - value=value, |
208 | | - key_cache=key_cache, |
209 | | - value_cache=value_cache, |
210 | | - slot_indices=slots) |
211 | | - |
212 | | - # use paged attention |
213 | | - torch_npu._npu_paged_attention_splitfuse( |
214 | | - query=query, |
215 | | - key_cache=key_cache, |
216 | | - value_cache=value_cache, |
217 | | - mask=attn_metadata.attn_mask, |
218 | | - block_table=attn_metadata.block_tables, |
219 | | - seq_len=attn_metadata.seq_lens, |
220 | | - context_lens=attn_metadata.context_lens, |
221 | | - num_kv_heads=self.num_kv_heads, |
222 | | - num_heads=self.num_heads, |
223 | | - scale_value=self.scale, |
224 | | - out=output) |
225 | 198 | return output.view(num_tokens, self.hidden_size) |
| 199 | + |
| 200 | + |
| 201 | +def unified_ascend_attention_with_output( |
| 202 | + layer: AttentionLayer, |
| 203 | + query: torch.Tensor, |
| 204 | + key: torch.Tensor, |
| 205 | + value: torch.Tensor, |
| 206 | + kv_cache: torch.Tensor, |
| 207 | + attn_metadata: AscendMetadata, |
| 208 | + output: torch.Tensor, |
| 209 | + self_num_heads: int, |
| 210 | + self_head_size: int, |
| 211 | + self_scale: float, |
| 212 | + self_num_kv_heads: int, |
| 213 | + self_hidden_size: int, |
| 214 | + self_kv_cache_dtype: str, |
| 215 | + self_sliding_window: Optional[int], |
| 216 | + self_alibi_slopes: torch.Tensor, |
| 217 | + self_attn_type: str, |
| 218 | + self_num_queries_per_kv: int, |
| 219 | + self_seq_len_cpu_tensor: int, |
| 220 | +) -> None: |
| 221 | + num_tokens = query.shape[0] |
| 222 | + if attn_metadata is None: |
| 223 | + return output.view(num_tokens, self_hidden_size) |
| 224 | + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 |
| 225 | + attn_type = self_attn_type |
| 226 | + if attn_type != AttentionType.DECODER: |
| 227 | + raise NotImplementedError("Encoder self-attention and " |
| 228 | + "encoder/decoder cross-attention " |
| 229 | + "are not implemented for " |
 | 230 | +                                  "AscendAttentionBackendImpl")
| 231 | + # View q k v to BSH. |
| 232 | + query = query.view(-1, self_num_heads, self_head_size) |
| 233 | + key = key.view(-1, self_num_kv_heads, self_head_size) |
| 234 | + value = value.view(-1, self_num_kv_heads, self_head_size) |
| 235 | + # TODO: Remove this contiguous in the future. |
| 236 | + value = value.contiguous() |
| 237 | + |
| 238 | + if hasattr(layer, 'quant_method'): |
| 239 | + # TODO: Add attr (num_prefills, prefill_metadata, decode_metadata) to AscendMetadata |
| 240 | + pass |
| 241 | + else: |
| 242 | + if kv_cache.numel() > 0: |
| 243 | + key_cache, value_cache = kv_cache[0], kv_cache[1] |
| 244 | + num_blocks, block_size, _ = key_cache.shape |
| 245 | + key_cache = key_cache.view(num_blocks, block_size, |
| 246 | + self_num_kv_heads, self_head_size) |
| 247 | + value_cache = value_cache.view(num_blocks, block_size, |
| 248 | + self_num_kv_heads, |
| 249 | + self_head_size) |
| 250 | + slots = attn_metadata.slot_mapping |
| 251 | + torch_npu._npu_reshape_and_cache(key=key, |
| 252 | + value=value, |
| 253 | + key_cache=key_cache, |
| 254 | + value_cache=value_cache, |
| 255 | + slot_indices=slots) |
| 256 | + |
| 257 | + # use paged attention |
| 258 | + torch_npu._npu_paged_attention_splitfuse( |
| 259 | + query=query, |
| 260 | + key_cache=key_cache, |
| 261 | + value_cache=value_cache, |
| 262 | + mask=attn_metadata.attn_mask, |
| 263 | + block_table=attn_metadata.block_tables, |
| 264 | + seq_len=attn_metadata.seq_lens, |
| 265 | + context_lens=attn_metadata.context_lens, |
| 266 | + num_kv_heads=self_num_kv_heads, |
| 267 | + num_heads=self_num_heads, |
| 268 | + scale_value=self_scale, |
| 269 | + out=output) |
| 270 | + |
| 271 | + |
| 272 | +def unified_attention_with_output_fake( |
| 273 | + layer: AttentionLayer, |
| 274 | + query: torch.Tensor, |
| 275 | + key: torch.Tensor, |
| 276 | + value: torch.Tensor, |
| 277 | + kv_cache: torch.Tensor, |
| 278 | + attn_metadata: AscendMetadata, |
| 279 | + output: torch.Tensor, |
| 280 | + self_num_heads: int, |
| 281 | + self_head_size: int, |
| 282 | + self_scale: float, |
| 283 | + self_num_kv_heads: int, |
| 284 | + self_hidden_size: int, |
| 285 | + self_kv_cache_dtype: str, |
| 286 | + self_sliding_window: Optional[int], |
| 287 | + self_alibi_slopes: torch.Tensor, |
| 288 | + self_attn_type: str, |
| 289 | + self_num_queries_per_kv: int, |
| 290 | + self_seq_len_cpu_tensor: int, |
| 291 | +) -> None: |
| 292 | + return |
| 293 | + |
| 294 | + |
| 295 | +direct_register_custom_op( |
| 296 | + op_name="unified_ascend_attention_with_output", |
| 297 | + op_func=unified_ascend_attention_with_output, |
| 298 | + mutates_args=["output"], |
| 299 | + fake_impl=unified_attention_with_output_fake, |
| 300 | + dispatch_key="PrivateUse1", |
| 301 | +) |
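
For readers unfamiliar with the pattern this diff applies: vLLM's `direct_register_custom_op` wraps a plain Python function as a `torch.ops.vllm.*` custom op so the attention call stays visible to `torch.compile`/graph capture, a fake implementation stands in during tracing, and `mutates_args` declares the pre-allocated `output` buffer the real kernel writes into (which is also why `accept_output_buffer = True` is set on the backend). Below is a minimal, self-contained sketch of that pattern; the toy op name, the doubling "kernel", and the `CPU` dispatch key are illustrative assumptions and not part of this PR, which registers against `PrivateUse1` for the NPU.

```python
import torch

from vllm.utils import direct_register_custom_op


def toy_attention_with_output(query: torch.Tensor,
                              output: torch.Tensor) -> None:
    # Stand-in for the NPU kernels above: write the result into the
    # caller-provided buffer instead of allocating and returning a new tensor.
    output.copy_(query * 2.0)


def toy_attention_with_output_fake(query: torch.Tensor,
                                   output: torch.Tensor) -> None:
    # Fake (meta) implementation used while tracing/compiling: no computation,
    # output shape/dtype are already carried by the mutated `output` argument.
    return


direct_register_custom_op(
    op_name="toy_attention_with_output",
    op_func=toy_attention_with_output,
    mutates_args=["output"],
    fake_impl=toy_attention_with_output_fake,
    dispatch_key="CPU",  # the PR uses "PrivateUse1" (NPU); "CPU" is for illustration only
)

# The call site mirrors the change to forward() above: allocate (or reuse)
# an output buffer once, then dispatch through the torch.ops.vllm namespace.
q = torch.randn(4, 8)
out = torch.empty_like(q)
torch.ops.vllm.toy_attention_with_output(query=q, output=out)
```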