# SPDX-License-Identifier: Apache-2.0
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""

import torch
from torch import nn
from torch.library import Library

from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                         set_current_vllm_config)
from vllm.utils import direct_register_custom_op


global_counter = 0

# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT")  # noqa


def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
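    # Stand-in "attention" op: bump a global counter so the test can count
    # invocations, then copy q into the pre-allocated output buffer and
    # perturb its first element.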
    global global_counter
    global_counter += 1
    print(f"{global_counter=}")
    out.copy_(q)
    out[0] += 1


def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
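    # Fake (meta) implementation used during tracing: the real op mutates
    # `out` in place and returns nothing, so the fake only needs to match
    # the signature.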
    return


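# Register the op so it is callable as torch.ops.silly.attention; the
# PrivateUse1 dispatch key routes it to the out-of-tree (NPU) backend.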
direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    dispatch_key="PrivateUse1",
    target_lib=silly_lib,
)


@support_torch_compile
class SillyModel(nn.Module):

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = "",
                 **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Overall effect:
        x += 1
        x[0] += 2
        global_counter += 2
        """
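        # Each call to the custom attention op below is a splitting point,
        # so piecewise compilation sees 5 subgraphs: 3 capturable pieces
        # separated by the 2 attention calls.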
        x = x + 1
        x = x + 2
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x - 2
        x = x - 1
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x + 1
        return x


def test_simple_piecewise_compile():

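    # Piecewise compilation without inductor: split the graph at the custom
    # attention op and capture cudagraphs for batch sizes 1 and 2.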
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_inductor=False,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_copy_inputs=True,
        cudagraph_capture_sizes=[1, 2],
    ))
    vllm_config.compilation_config.pass_config.enable_fusion = False
    with set_current_vllm_config(vllm_config):
        model = SillyModel(vllm_config=vllm_config, prefix="")

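    # Batch size 100 is not in cudagraph_capture_sizes, so the first call
    # exercises the compiled graphs without cudagraph replay.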
    inputs = torch.randn(100).npu()

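    # Expected counters: 2 attention calls split the graph into 5 pieces,
    # 3 of which are capturable; each capturable piece is compiled once and
    # captured for both sizes in cudagraph_capture_sizes (3 * 2 = 6).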
    with compilation_counter.expect(
            num_graphs_seen=1,  # one graph for the model
            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=6,
    ):

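        # First call triggers the dynamic-shape compilation.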
        model(inputs)

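        # Calls at the pre-configured sizes trigger cudagraph capture.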
        model(torch.randn(2).npu())
        model(torch.randn(1).npu())

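        # Replay at a captured size and check the exact numeric effect
        # documented in SillyModel.forward: every element +1, x[0] +2,
        # and exactly two attention invocations.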
        input = torch.zeros(2).npu()
        global global_counter
        global_counter = 0
        output = model(input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3.0, 1.0]))


if __name__ == "__main__":
    test_simple_piecewise_compile()