# SPDX-License-Identifier: Apache-2.0
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""

import pytest
import torch
from torch import nn
from torch.library import Library

from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                         set_current_vllm_config)
from vllm.utils import direct_register_custom_op

global_counter = 0

# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT")  # noqa


def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """Stand-in for attention: copies q into out, adds 1 to element 0, and
    bumps a global counter so the test can count how many times it ran."""
    global global_counter
    global_counter += 1
    print(f"{global_counter=}")
    out.copy_(q)
    out[0] += 1


def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """Fake (meta) implementation used during tracing; no side effects."""
    return


direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    dispatch_key="PrivateUse1",
    target_lib=silly_lib,
)
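
# The call above registers the op as `torch.ops.silly.attention`. Because it
# is a registered custom op rather than a plain Python function, Dynamo
# treats it as opaque, which is what allows `splitting_ops=["silly.attention"]`
# in the config below to cut the traced graph at each call site.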


@support_torch_compile
class SillyModel(nn.Module):

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = "",
                 **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Overall effect:
        x += 1
        x[0] += 2
        global_counter += 2
        """
        x = x + 1
        x = x + 2
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x - 2
        x = x - 1
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x + 1
        return x


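# With num_layers = 2 attention calls acting as splitting points, piecewise
# compilation should yield 2 * num_layers + 1 = 5 piecewise graphs, of which
# num_layers + 1 = 3 (the spans between/around the attention ops) are
# cudagraph-capturable; the counter expectations in the test below follow
# from these two numbers.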
@pytest.mark.skipif(True, reason="requires unreleased components")
def test_simple_piecewise_compile():

    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_inductor=False,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_copy_inputs=True,
        cudagraph_capture_sizes=[1, 2],
    ))
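    # use_inductor=False should keep the compiled subgraphs running the same
    # kernels as eager mode, so the exact outputs asserted below stay
    # predictable; fusion is disabled next for the same reason.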
    vllm_config.compilation_config.pass_config.enable_fusion = False
    with set_current_vllm_config(vllm_config):
        model = SillyModel(vllm_config=vllm_config, prefix="")

    inputs = torch.randn(100).npu()

    with compilation_counter.expect(
            num_graphs_seen=1,  # one graph for the model
            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=6,
    ):

        # The first call (size 100) triggers compilation; the size-2 and
        # size-1 calls match cudagraph_capture_sizes and should capture one
        # cudagraph per capturable subgraph.
        model(inputs)

        model(torch.randn(2).npu())
        model(torch.randn(1).npu())

        input = torch.zeros(2).npu()
        global global_counter
        global_counter = 0
        output = model(input)
        # two attention calls per forward pass
        assert global_counter == 2
        # [0, 0] -> +1 everywhere, +2 extra on element 0 -> [3, 1]
        assert torch.allclose(output.cpu(), torch.tensor([3.0, 1.0]))


if __name__ == "__main__":
    test_simple_piecewise_compile()