
Commit edce3b8

support npugraph to default
Signed-off-by: Bug Hunter Yan <yanpq@zju.edu.cn>
1 parent 66a0837 commit edce3b8

File tree

13 files changed: +422 −90 lines


.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ jobs:
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -e .
+          pip install -v --no-build-isolation -e .
 
       - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
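
The editable install now runs with --no-build-isolation, so pip builds the C++ extension against the packages already installed in the environment instead of a throwaway isolated build environment; this is also why pip and wheel are added to requirements.txt below, since build isolation would otherwise have provided them automatically. The equivalent local install, assuming CANN and torch_npu are already set up, would be:

    pip install -r requirements-dev.txt
    pip install -v --no-build-isolation -e .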

csrc/ops.h

Lines changed: 18 additions & 1 deletion
@@ -21,6 +21,7 @@
 
 #include <vector>
 #include "kernels/types.h"
+#include "torch_npu/csrc/aten/common/from_blob.h"
 
 namespace vllm_ascend {
 extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst,
@@ -29,4 +30,20 @@ namespace vllm_ascend {
                                   const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
                                   const int headSize, const int64_t numTokens, const uint32_t loopCnt,
                                   uint32_t aivNum);
-}
+
+torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
+    if (!tensor.is_privateuseone()) {
+        throw std::runtime_error("Tensor must be on NPU device");
+    }
+    // Get the raw data pointer
+    void* data_ptr = tensor.data_ptr();
+    // Get tensor sizes and strides
+    std::vector<int64_t> sizes = tensor.sizes().vec();
+    std::vector<int64_t> strides = tensor.strides().vec();
+    // Get tensor options (dtype, device)
+    auto options = tensor.options();
+    // Create a new tensor from the raw data pointer
+    auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options);
+    return new_tensor;
+}
+}
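
weak_ref_tensor produces a non-owning alias of an NPU tensor: at_npu::native::from_blob wraps the existing data pointer with the original sizes, strides, and options, so the returned tensor shares storage but does not extend its lifetime. This mirrors the weak_ref_tensor op vLLM uses on CUDA, where the piecewise graph backend hands out weak references so that tensors captured into a graph do not keep their allocations alive. The Python-visible binding is registered in torch_binding.cpp below.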

csrc/torch_binding.cpp

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,8 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::T
 TORCH_LIBRARY_EXPAND(_C, ops)
 {
     // vLLM-Ascend custom ops
+    ops.def("weak_ref_tensor(Tensor input) -> Tensor");
+    ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_ascend::weak_ref_tensor);
 
     // Rotary embedding
     // Apply GPT-NeoX style rotary embedding to query and key.
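
With the schema defined and the PrivateUse1 (NPU) kernel bound, the op becomes callable from Python. A minimal sketch of a call, assuming TORCH_LIBRARY_EXPAND(_C, ...) expands to the _C namespace (as in vLLM) and that importing vllm_ascend loads the _C extension:

    import torch
    import torch_npu  # noqa: F401  (provides the NPU / PrivateUse1 backend)
    import vllm_ascend  # noqa: F401  (loads the _C extension with the custom ops)

    x = torch.ones(4, device="npu")
    # The result aliases x's storage without owning it, so it must not
    # outlive the original allocation.
    alias = torch.ops._C.weak_ref_tensor(x)
    assert alias.data_ptr() == x.data_ptr()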

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 -r requirements-lint.txt
+-r requirements.txt
 modelscope
 pytest >= 6.0
 pytest-asyncio

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@ cmake>=3.26
 decorator
 numpy<2.0.0
 packaging
+pip
 pybind11
 pyyaml
 scipy
@@ -11,3 +12,4 @@ setuptools-scm>=8
 torch_npu
 torch >= 2.5.1
 torchvision<0.21.0
+wheel

tests/compile/__init__.py

Whitespace-only changes.

tests/compile/test_simple.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Test the piecewise compilation with a simple model so that we
+can exactly calculate the expected output and side effects.
+"""
+
+import os
+
+import torch
+import vllm_ascend  # noqa: F401
+from torch import nn
+from torch.library import Library
+from torch_npu.contrib import transfer_to_npu  # noqa: F401
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+from vllm.utils import direct_register_custom_op
+
+global_counter = 0
+
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
+
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    global global_counter
+    global_counter += 1
+    print(f"{global_counter=}")
+    out.copy_(q)
+    out[0] += 1
+
+
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    dispatch_key="PrivateUse1",
+    target_lib=silly_lib,
+)
+
+
+@support_torch_compile
+class SillyModel(nn.Module):
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = '',
+                 **kwargs) -> None:
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Overall effect:
+        x += 1
+        x[0] += 2
+        global_counter += 2
+        """
+        x = x + 1
+        x = x + 2
+        out = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, out)
+        x = out
+        x = x - 2
+        x = x - 1
+        out = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, out)
+        x = out
+        x = x + 1
+        return x
+
+
+def test_simple_piecewise_compile():
+
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_inductor=False,
+        use_cudagraph=True,
+        splitting_ops=["silly.attention"],
+        cudagraph_copy_inputs=True,
+        cudagraph_capture_sizes=[1, 2],
+    ))
+    vllm_config.compilation_config.pass_config.enable_fusion = False
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100).npu()
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
+            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
+            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
+            num_cudagraph_caputured=
+            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    ):
+
+        model(inputs)
+
+        model(torch.randn(2).npu())
+        model(torch.randn(1).npu())
+
+        input = torch.zeros(2).npu()
+        global global_counter
+        global_counter = 0
+        output = model(input)
+        assert global_counter == 2
+        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+
+
+if __name__ == "__main__":
+    os.environ["VLLM_USE_V1"] = "1"
+    test_simple_piecewise_compile()
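
The expected counters follow from the model shape: splitting at the two silly.attention calls yields 2 × 2 + 1 = 5 piecewise graphs, of which the 3 attention-free pieces are capturable; with two capture sizes ([1, 2]) that gives 3 × 2 = 6 captured graphs. The asserted output can also be checked by hand; a CPU-only sketch of the same arithmetic, with a plain function standing in for the registered op:

    import torch

    def silly_attention_ref(q, k, v, out):
        # same effect as the custom op: copy q, bump element 0
        out.copy_(q)
        out[0] += 1

    x = torch.zeros(2)
    x = x + 1
    x = x + 2                            # x = [3., 3.]
    out = torch.empty_like(x)
    silly_attention_ref(x, x, x, out)
    x = out                              # x = [4., 3.]
    x = x - 2
    x = x - 1                            # x = [1., 0.]
    out = torch.empty_like(x)
    silly_attention_ref(x, x, x, out)
    x = out                              # x = [2., 0.]
    x = x + 1                            # x = [3., 1.]
    assert torch.allclose(x, torch.tensor([3., 1.]))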

vllm_ascend/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -15,6 +15,8 @@
 # This file is a part of the vllm-ascend project.
 #
 
+from .utils import register_dummy_fusion_op
+
 
 def register():
     """Register the NPU platform."""
@@ -28,3 +30,6 @@ def register():
 def register_model():
     from .models import register_model
     register_model()
+
+
+register_dummy_fusion_op()
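
register_dummy_fusion_op() now runs at import time; its implementation lives in vllm_ascend/utils.py, one of the 13 changed files not shown in this excerpt. Judging by the name, and by the test above disabling the fusion pass (enable_fusion = False), it presumably registers placeholder fusion ops so that vLLM's compilation config can be instantiated on NPU without the CUDA fusion kernels.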
