
Commit 3373dd7

toy llama tests passing
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
1 parent 40c8d5a commit 3373dd7

3 files changed: +40 −31 lines


tests/compile/piecewise/test_toy_llama.py

Lines changed: 11 additions & 13 deletions
@@ -241,14 +241,14 @@ def tractable_computation(input_ids: torch.Tensor,
 @torch.inference_mode
 def run_model(llama_config,
               use_compile: bool,
-              use_inductor: bool,
+              backend: str,
               split_attn: bool = False) -> torch.Tensor:

     if use_compile:
         compilation_config = CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            use_inductor=use_inductor,
+            backend=backend,
             cudagraph_capture_sizes=[1, 2],
         )
         if split_attn:
@@ -310,8 +310,8 @@ def run_model(llama_config,
     return output.cpu()


-@pytest.mark.parametrize("use_inductor", [True, False])
-def test_toy_llama(use_inductor: bool):
+@pytest.mark.parametrize("backend", ["inductor", "eager"])
+def test_toy_llama(backend: str):
     # compare output with and without piecewise compilation

     llama_config = LlamaConfig(hidden_size=128,
@@ -334,10 +334,10 @@ def test_toy_llama(use_inductor: bool):
             num_cudagraph_captured=0,
     ):
         outputs.append(
-            run_model(llama_config, use_inductor=False, use_compile=False))
-    run_model(tractable_config, use_inductor=False, use_compile=False)
+            run_model(llama_config, backend="eager", use_compile=False))
+    run_model(tractable_config, backend="eager", use_compile=False)

-    if use_inductor:
+    if backend == "inductor":
         kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
     else:
         kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
@@ -352,10 +352,8 @@ def test_toy_llama(use_inductor: bool):
             **kwargs,
     ):
         outputs.append(
-            run_model(llama_config,
-                      use_inductor=use_inductor,
-                      use_compile=True))
-    run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
+            run_model(llama_config, backend=backend, use_compile=True))
+    run_model(tractable_config, backend=backend, use_compile=True)

     with compilation_counter.expect(
             num_graphs_seen=1,  # one graph for the model
@@ -371,11 +369,11 @@ def test_toy_llama(use_inductor: bool):
     ):
         outputs.append(
             run_model(llama_config,
-                      use_inductor=use_inductor,
+                      backend=backend,
                       use_compile=True,
                       split_attn=True))
     run_model(tractable_config,
-              use_inductor=use_inductor,
+              backend=backend,
               use_compile=True,
               split_attn=True)
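
The change above is a mechanical migration: the boolean use_inductor argument becomes a string backend argument, with True mapping to "inductor" and False to "eager". A minimal sketch of that mapping (the helper below is illustrative, not part of the commit):

# Hypothetical shim showing how the deprecated boolean flag maps onto
# the new string-valued "backend" option used by run_model above.
def backend_from_use_inductor(use_inductor: bool) -> str:
    return "inductor" if use_inductor else "eager"

# old: run_model(config, use_inductor=True, use_compile=True)
# new: run_model(config, backend="inductor", use_compile=True)
assert backend_from_use_inductor(True) == "inductor"
assert backend_from_use_inductor(False) == "eager"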

tests/model_executor/test_enabled_custom_ops.py

Lines changed: 4 additions & 5 deletions
@@ -57,7 +57,8 @@ class Relu3(ReLUSquaredActivation):
     # All but ReLU3 (even if ReLU2 is on)
     ("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True),
     # RMSNorm and SiluAndMul
-    ("none,-relu3,+rms_norm,+silu_and_mul", 4, "eager", [1, 1, 0, 0], False),
+    ("none,-relu3,+rms_norm,+silu_and_mul", 4, "eager", [1, 1, 0, 0], False
+     ),
     # All but RMSNorm
     ("-rms_norm", 3, "eager", [0, 1, 1, 1], True),
     #
@@ -71,10 +72,8 @@ class Relu3(ReLUSquaredActivation):
 def test_enabled_ops(env: Optional[str], torch_level: int, backend: str,
                      ops_enabled: list[int], default_on: bool):
     custom_ops = env.split(',') if env else []
-    vllm_config = VllmConfig(
-        compilation_config=CompilationConfig(backend=backend,
-                                             level=torch_level,
-                                             custom_ops=custom_ops))
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        backend=backend, level=torch_level, custom_ops=custom_ops))
     with set_current_vllm_config(vllm_config):

         assert CustomOp.default_on() == default_on
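
For context, the env strings in the parametrization above ("none,-relu3,+rms_norm,+silu_and_mul", etc.) are comma-separated custom_ops tokens; per the validation in compilation.py below, each token must be 'all', 'none', '+op' or '-op'. A standalone sketch of that token check (illustrative only; vllm's actual parsing lives in CompilationConfig):

# Illustrative restatement of the custom_ops token rule; not vllm's code.
def validate_custom_ops(spec: str) -> list[str]:
    tokens = spec.split(',') if spec else []
    for tok in tokens:
        if tok in ("all", "none"):
            continue
        if tok[:1] in ("+", "-") and len(tok) > 1:
            continue  # '+op' or '-op' with a registered op name
        raise ValueError("must be 'all', 'none', '+op' or '-op' "
                         "(where 'op' is the registered op name)")
    return tokens

validate_custom_ops("none,-relu3,+rms_norm,+silu_and_mul")  # ok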

vllm/config/compilation.py

Lines changed: 25 additions & 13 deletions
@@ -7,6 +7,7 @@
 from dataclasses import asdict, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
+
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass

@@ -212,7 +213,8 @@ class CompilationConfig:
     """
     Whether to use inductor compilation.

-    This flag is deprecated and will be removed. Please use the 'backend' option instead.
+    This flag is deprecated and will be removed.
+    Please use the 'backend' option instead.

     - False: inductor compilation is not used. graph runs in eager
       (custom_ops enabled by default).
@@ -514,17 +516,22 @@ def __post_init__(self, **kwargs) -> None:
                 "must be 'all', 'none', '+op' or '-op' "
                 "(where 'op' is the registered op name)")

-        # Currently only eager and inductor backends are supported for piecewise compilation.
-        # Update when more backends are supported.
-        if self.level == CompilationLevel.PIECEWISE and self.backend not in ["", "eager", "inductor"]:
-            raise ValueError(f"Invalid backend for piecewise compilation: {self.backend}")
+        # Currently only eager and inductor backends are supported
+        # for piecewise compilation. Update when more backends are supported.
+        if self.level == CompilationLevel.PIECEWISE and self.backend not in [
+                "", "eager", "inductor"
+        ]:
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}")
+
+        if self.backend == "":
+            self.backend = "inductor"

         logger.warning_once(
-            "The 'use_inductor' flag is deprecated and will be removed in a future release. "
-            "Please use the 'backend' option instead.",
-        )
-
-
+            "The 'use_inductor' flag is deprecated and will be "
+            "removed in a future release. "
+            "Please use the 'backend' option instead.")
+
     def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
@@ -534,7 +541,10 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
         The backend for the compilation config.
         """
         if self.level is None:
-            raise ValueError("No compilation level is set. This method should only be called via vllm config where the level is set if none is provided.")
+            raise ValueError(
+                "No compilation level is set. This method should only be "
+                "called via vllm config where the level is set if none is "
+                "provided.")
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")

@@ -554,8 +564,10 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
         elif self.backend in ["eager", "inductor"]:
             vllm_config.compilation_config.backend = self.backend
         else:
-            raise ValueError(f"Invalid backend for piecewise compilation: {self.backend}")
-
+            raise ValueError(
+                f"Invalid backend for piecewise compilation: {self.backend}"
+            )
+
         assert self.level == CompilationLevel.PIECEWISE

         from vllm.compilation.backends import VllmBackend
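
Net effect of the config change: for PIECEWISE compilation, backend must be "", "eager", or "inductor", and an empty value now defaults to "inductor". A minimal standalone sketch of that decision logic (the function name is illustrative; the real logic sits in CompilationConfig.__post_init__):

# Sketch of the validate-then-default behavior added in __post_init__.
_PIECEWISE_BACKENDS = ("", "eager", "inductor")

def resolve_piecewise_backend(backend: str) -> str:
    if backend not in _PIECEWISE_BACKENDS:
        raise ValueError(
            f"Invalid backend for piecewise compilation: {backend}")
    # An unset backend falls back to inductor, matching the new default.
    return backend or "inductor"

assert resolve_piecewise_backend("") == "inductor"
assert resolve_piecewise_backend("eager") == "eager"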
