 import torch
 
 import vllm.envs as envs
-from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.compilation.fusion import RMSNormQuantFusionPass
-from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
+from vllm.compilation.fx_utils import find_auto_fn
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.compilation.sequence_parallelism import SequenceParallelismPass
     initialize_model_parallel,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils import update_environment_variables
 ]
 
 
-class TestModel(torch.nn.Module):
-    def __init__(self, hidden_size=16, intermediate_size=32):
+class TestAllReduceRMSNormModel(torch.nn.Module):
+    def __init__(self, hidden_size=16, eps=1e-6):
         super().__init__()
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.gate_proj = torch.nn.Parameter(
-            torch.empty((intermediate_size, hidden_size))
-        )
-        self.norm = RMSNorm(intermediate_size, 1e-05)
-        # Initialize weights
-        torch.nn.init.normal_(self.gate_proj, std=0.02)
+        self.eps = eps
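+        # four norm layers and three weights give the four all_reduce +
+        # rms_norm pairs the sequence-parallelism pass is expected to match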
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+        self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
 
-    def forward(self, hidden_states, residual):
-        """
-        Forward pass implementing the operations in the FX graph
+    def forward(self, x):
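+        # avoid having graph input be an arg to a pattern directly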
+        z = torch.relu(x)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
 
-        Args:
-            hidden_states: Input tensor
-            residual: Residual tensor from previous layer
+        z2 = torch.mm(y, self.w[0])
+        x2 = tensor_model_parallel_all_reduce(z2)
 
-        Returns:
-            Tuple containing the output tensor
-        """
-        # Reshape input
-        view = hidden_states.reshape(-1, self.hidden_size)
+        y2, resid = self.norm[1](x2, resid)
 
-        # matrix multiplication
-        permute = self.gate_proj.permute(1, 0)
-        mm = torch.mm(view, permute)
+        z3 = torch.mm(y2, self.w[1])
+        x3 = tensor_model_parallel_all_reduce(z3)
 
-        # Tensor parallel all-reduce
-        all_reduce = tensor_model_parallel_all_reduce(mm)
+        y3, resid = self.norm[2](x3, resid)
 
-        # layer normalization
-        norm_output, residual_output = self.norm(all_reduce, residual)
+        z4 = torch.mm(y3, self.w[2])
+        x4 = tensor_model_parallel_all_reduce(z4)
 
-        return norm_output, residual_output
+        y4, resid = self.norm[3](x4, resid)
+        return y4
 
     def ops_in_model_before(self):
         return [torch.ops.vllm.all_reduce.default]
 
     def ops_in_model_after(self):
         return [
-            torch.ops.vllm.reduce_scatter.default,
             torch.ops.vllm.all_gather.default,
+            torch.ops.vllm.reduce_scatter.default,
         ]
 
     def ops_in_model(self):
-        return [torch.ops._C.fused_add_rms_norm.default]
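+        # the custom rms_norm ops only appear in the graph when the rms_norm
+        # custom op is enabled; the native implementation leaves none to check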
+        if RMSNorm.enabled():
+            return [
+                torch.ops._C.rms_norm.default,
+                torch.ops._C.fused_add_rms_norm.default,
+            ]
+        else:
+            return []
 
 
-class TestQuantModel(torch.nn.Module):
-    def __init__(self, hidden_size=16, intermediate_size=32):
+class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
+    def __init__(self, hidden_size=16, eps=1e-6):
         super().__init__()
+        self.vllm_config = get_current_vllm_config()
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.gate_proj = torch.nn.Parameter(
-            torch.empty((intermediate_size, hidden_size)), requires_grad=False
+        self.eps = eps
+        self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
+        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
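+        # transposed to the column-major layout expected by torch._scaled_mm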
+        self.w = [
+            torch.rand(hidden_size, hidden_size)
+            .to(dtype=current_platform.fp8_dtype())
+            .t()
+            for _ in range(3)
+        ]
+
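+        # static per-tensor activation quantization; input scales are constants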
+        self.fp8_linear = Fp8LinearOp(
+            act_quant_static=True,
+            act_quant_group_shape=GroupShape.PER_TENSOR,
         )
-        self.norm = RMSNorm(intermediate_size, 1e-05)
-        # Initialize weights
-        torch.nn.init.normal_(self.gate_proj, std=0.02)
-
-        self.fp8_linear = Fp8LinearOp(act_quant_static=True)
-
-        self.scale = torch.rand(1, dtype=torch.float32)
-        # Create a weight that is compatible with torch._scaled_mm,
-        # which expects a column-major layout.
-        self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
-        self.wscale = torch.rand(1, dtype=torch.float32)
-
-    def forward(self, hidden_states, residual):
-        """
-        Forward pass implementing the operations in the FX graph
-
-        Args:
-            hidden_states: Input tensor
-            residual: Residual tensor from previous layer
-
-        Returns:
-            Tuple containing the output tensor
-        """
-        # Reshape input
-        view = hidden_states.reshape(-1, self.hidden_size)
-
-        # matrix multiplication
-        permute = self.gate_proj.permute(1, 0)
-        mm = torch.mm(view, permute)
-
-        # Tensor parallel all-reduce
-        all_reduce = tensor_model_parallel_all_reduce(mm)
-
-        # layer normalization
-        norm_output, residual_output = self.norm(all_reduce, residual)
-
-        # scaled_mm with static input quantization
-        fp8_linear_result = self.fp8_linear.apply(
-            norm_output,
-            self.w,
-            self.wscale,
-            input_scale=self.scale.to(norm_output.device),
+
+        self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
+
+    def forward(self, hidden_states):
+        # avoid having graph input be an arg to a pattern directly
+        z = torch.relu(hidden_states)
+        x = resid = tensor_model_parallel_all_reduce(z)
+        y = self.norm[0](x)
+
+        z2 = self.fp8_linear.apply(
+            y, self.w[0], self.wscale[0], input_scale=self.scale[0]
         )
 
-        return fp8_linear_result, residual_output
+        x2 = tensor_model_parallel_all_reduce(z2)
+        y2, resid = self.norm[1](x2, resid)
 
-    def ops_in_model_before(self):
-        ops_to_remove = [torch.ops.vllm.all_reduce.default]  # Always removed by SP
-        # The following are only removed if fusion happens
-        config = get_current_vllm_config()
-        if config.compilation_config.pass_config.enable_fusion:
-            ops_to_remove.append(torch.ops._C.fused_add_rms_norm.default)
-        # Only check for static_scaled_fp8_quant if custom quant_fp8 is enabled
-        if "+quant_fp8" in config.compilation_config.custom_ops:
-            ops_to_remove.append(torch.ops._C.static_scaled_fp8_quant.default)
-        return ops_to_remove
+        z3 = self.fp8_linear.apply(
+            y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
+        )
+
+        x3 = tensor_model_parallel_all_reduce(z3)
+        y3, resid = self.norm[2](x3, resid)  # use resid here
+
+        z4 = self.fp8_linear.apply(
+            y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
+        )
+        x4 = tensor_model_parallel_all_reduce(z4)
+        y4, resid = self.norm[3](x4, resid)  # use resid here
+        return y4
 
     def ops_in_model_after(self):
-        ops_to_add = [
-            torch.ops.vllm.reduce_scatter.default,
+        return [
             torch.ops.vllm.all_gather.default,
+            torch.ops.vllm.reduce_scatter.default,
+        ]
+
+    def ops_in_model_before(self):
+        return [
+            torch.ops.vllm.all_reduce.default,
         ]
-        # The following is only added if fusion happens and custom quant_fp8 is enabled
-        config = get_current_vllm_config()
-        if (
-            config.compilation_config.pass_config.enable_fusion
-            and "+quant_fp8" in config.compilation_config.custom_ops
-        ):
-            ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default)
-        return ops_to_add
 
     def ops_in_model(self):
-        config = get_current_vllm_config()
-        if config.compilation_config.pass_config.enable_fusion:
-            # If fusion happens with custom quant_fp8, the fused op is the one
-            # we check for (de)functionalization
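+        # which ops remain depends on fusion and on which custom ops are enabled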
+        if self.vllm_config.compilation_config.pass_config.enable_fusion:
             return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
-        else:
-            # If no fusion or using native quant, the original ops are checked
+        elif RMSNorm.enabled():
             return [
                 torch.ops._C.fused_add_rms_norm.default,
-                # TODO functionalization pass does not handle this yet
-                # torch.ops._C.static_scaled_fp8_quant.default,
             ]
+        elif self.fp8_linear.quant_fp8.enabled():
+            return [
+                torch.ops._C.static_scaled_fp8_quant.default,
+            ]
+        else:
+            return []
 
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "test_model_cls, custom_ops",
     [
-        (TestModel, ""),
-        (TestQuantModel, "+quant_fp8"),
-        (TestQuantModel, "-quant_fp8"),
+        (TestAllReduceRMSNormModel, "+rms_norm"),
+        (TestAllReduceRMSNormModel, "-rms_norm"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"),
+        (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"),
     ],
 )
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("enable_fusion", [True, False])
+@pytest.mark.parametrize("dynamic", [False, True])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_sequence_parallelism_pass(
     test_model_cls: type[torch.nn.Module],
@@ -211,6 +193,7 @@ def test_sequence_parallelism_pass(
     hidden_size: int,
     dtype: torch.dtype,
     enable_fusion: bool,
+    dynamic: bool,
 ):
     num_processes = 2
 
@@ -228,6 +211,7 @@ def run_torch_spawn(fn, nprocs):
             hidden_size,
             dtype,
             enable_fusion,
+            dynamic,
         ),
         nprocs=nprocs,
     )
@@ -245,6 +229,7 @@ def sequence_parallelism_pass_on_test_model(
     hidden_size: int,
     dtype: torch.dtype,
     enable_fusion: bool,
+    dynamic: bool,
 ):
     current_platform.seed_everything(0)
 
@@ -295,7 +280,6 @@ def sequence_parallelism_pass_on_test_model(
     with set_current_vllm_config(vllm_config):
         noop_pass = NoOpEliminationPass(vllm_config)
         sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
-        func_pass = FixFunctionalizationPass(vllm_config)
         cleanup_pass = PostCleanupPass(vllm_config)
         assert (
             sequence_parallelism_pass.compilation_config.splitting_ops
@@ -316,38 +300,41 @@ def sequence_parallelism_pass_on_test_model(
 
     passes_for_backend.append(cleanup_pass)
 
-    backend_no_func = TestBackend(*passes_for_backend)
-    backend_func = TestBackend(*passes_for_backend, func_pass)
+    backend = TestBackend(*passes_for_backend)
 
-    model = test_model_cls(hidden_size, hidden_size * 2)
+    model = test_model_cls(hidden_size)
 
     hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
-    residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
 
-    compiled_model_no_func = torch.compile(model, backend=backend_no_func)
-    compiled_model_no_func(hidden_states, residual)
-    compiled_model_func = torch.compile(model, backend=backend_func)
-    compiled_model_func(hidden_states, residual)
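+    # mark the batch dim dynamic to exercise the pass with symbolic shapes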
+    if dynamic:
+        torch._dynamo.mark_dynamic(hidden_states, 0)
 
-    assert sequence_parallelism_pass.matched_count == 1
+    compiled_model = torch.compile(model, backend=backend)
+    compiled_model(hidden_states)
+
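+    # one match per all_reduce + rms_norm pair in the test models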
+    assert sequence_parallelism_pass.matched_count == 4
 
     # In pre-nodes, all reduce should be there,
     # reduce scatter and all gather should not
-    backend_no_func.check_before_ops(model.ops_in_model_before())
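+    # count occurrences by hand: each op should appear once per matched pattern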
+    pre_ops = [
+        node.target
+        for node in backend.graph_pre_pass.nodes
+        if node.op == "call_function"
+    ]
+    for op in model.ops_in_model_before():
+        num_op = len([pre_op for pre_op in pre_ops if pre_op == op])
+        assert num_op == 4
 
     # In post-nodes, reduce scatter and all gather should be there,
     # all reduce should not
-    backend_no_func.check_after_ops(model.ops_in_model_after())
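+    # same manual count for the collectives inserted by the pass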
+    post_ops = [
+        node.target
+        for node in backend.graph_post_pass.nodes
+        if node.op == "call_function"
+    ]
+    for op in model.ops_in_model_after():
+        num_op = len([post_op for post_op in post_ops if post_op == op])
+        assert num_op == 4
 
-    # check if the functionalization pass is applied
     for op in model.ops_in_model():
-        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
-        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
-
-    # make sure the ops were all de-functionalized
-    found = dict()
-    for node in backend_func.graph_post_pass.nodes:
-        for op in model.ops_in_model():
-            if is_func(node, op):
-                found[op] = True
-    assert all(found[op] for op in model.ops_in_model())
+        find_auto_fn(backend.graph_post_pass.nodes, op)