
Commit 3e4e159

Author: ilmarkov (committed)

Fix QuantFP8 matching

Update range based compilation

Signed-off-by: ilmarkov <imarkov@redhat.com>

1 parent b85e752 · commit 3e4e159

File tree

9 files changed (+569 -358 lines)


vllm/compilation/backends.py

Lines changed: 144 additions & 81 deletions
Large diffs are not rendered by default.

vllm/compilation/collective_fusion.py

Lines changed: 327 additions & 163 deletions
Large diffs are not rendered by default.

vllm/compilation/compiler_interface.py

Lines changed: 18 additions & 17 deletions
@@ -64,16 +64,17 @@ def compile(
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
+        compile_range: Optional[tuple[int, int]] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
         """
         Compile the graph with the given example inputs and compiler config,
-        with a runtime shape. If the `runtime_shape` is None, it means
+        with a runtime shape. If the `compile_range` is None, it means
         the `example_inputs` have a dynamic shape. Otherwise, the
-        `runtime_shape` specifies the shape of the inputs. Right now we only
-        support one variable shape for all inputs, which is the batchsize
-        (number of tokens) during inference.
+        `compile_range` specifies the range of the inputs,
+        it could be concrete size, e.g. (4, 4).
+        Right now we only support one variable shape for all inputs,
+        which is the batchsize (number of tokens) during inference.

         Dynamo will make sure `graph(*example_inputs)` is valid.

@@ -98,7 +99,7 @@ def load(self,
              graph: fx.GraphModule,
              example_inputs: list[Any],
              graph_index: int,
-             runtime_shape: Optional[int] = None) -> Callable:
+             compile_range: Optional[tuple[int, int]] = None) -> Callable:
         """
         Load the compiled function from the handle.
         Raises an error if the handle is invalid.
@@ -188,22 +189,22 @@ def compile(
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
+        compile_range: Optional[tuple[int, int]] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
         compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
             current_config.update(compiler_config)
-        set_inductor_config(current_config, runtime_shape)
+        set_inductor_config(current_config, compile_range)

-        if isinstance(runtime_shape, int):
+        if isinstance(compile_range, tuple):
             dynamic_shapes = "from_example_inputs"
         else:
             dynamic_shapes = "from_tracing_context"

         from torch._inductor import standalone_compile
-        with pass_context(runtime_shape):
+        with pass_context(compile_range):
             compiled_graph = standalone_compile(
                 graph,
                 example_inputs,
@@ -223,7 +224,7 @@ def load(self,
              graph: fx.GraphModule,
              example_inputs: list[Any],
              graph_index: int,
-             runtime_shape: Optional[int] = None) -> Callable:
+             compile_range: Optional[tuple[int, int]] = None) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
         assert isinstance(handle[1], str)
@@ -283,7 +284,7 @@ def compile(
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
+        compile_range: Optional[tuple[int, int]] = None,
         key: Optional[str] = None,
     ) -> tuple[Optional[Callable], Optional[Any]]:
         compilation_counter.num_inductor_compiles += 1
@@ -296,7 +297,7 @@ def compile(
         current_config["fx_graph_cache"] = True
         current_config["fx_graph_remote_cache"] = False

-        set_inductor_config(current_config, runtime_shape)
+        set_inductor_config(current_config, compile_range)

         # inductor can inplace modify the graph, so we need to copy it
         # see https://github.com/pytorch/pytorch/issues/138980
@@ -433,7 +434,7 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
                 torch._functorch.config.patch(
                     enable_remote_autograd_cache=False))

-        with pass_context(runtime_shape):
+        with pass_context(compile_range):
             compiled_graph = compile_fx(
                 graph,
                 example_inputs,
@@ -547,9 +548,9 @@ def metrics_context(self) -> contextlib.AbstractContextManager:
         return contextlib.nullcontext()


-def set_inductor_config(config, runtime_shape):
-    if isinstance(runtime_shape, int):
-        # for a specific batchsize, tuning triton kernel parameters
+def set_inductor_config(config, compile_range):
+    if isinstance(compile_range, tuple):
+        # for a specific range of batchsizes, tuning triton kernel parameters
         # can be beneficial
         config["max_autotune"] = True
         config["coordinate_descent_tuning"] = True
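
Editorial note: to make the new parameter concrete, a compile_range of None keeps the graph fully dynamic, a degenerate tuple such as (4, 4) pins a single batch size, and a wider tuple covers a span of token counts. The sketch below is illustrative only and not part of the commit; describe_compile_range and make_inductor_config are made-up helper names, and the half-open reading of (lo, hi) follows the is_in_range helper added in cuda_piecewise_backend.py further down.

from typing import Optional

def describe_compile_range(compile_range: Optional[tuple[int, int]]) -> str:
    # None -> fully dynamic shape, (s, s) -> one concrete size,
    # (lo, hi) -> half-open range of num_tokens (see is_in_range below).
    if compile_range is None:
        return "dynamic: one graph for all num_tokens"
    lo, hi = compile_range
    if lo == hi:
        return f"concrete: specialize for num_tokens == {lo}"
    return f"range: specialize for {lo} <= num_tokens < {hi}"

def make_inductor_config(compile_range: Optional[tuple[int, int]]) -> dict:
    # Mirrors set_inductor_config above: any tuple (range or concrete size)
    # turns on Inductor autotuning for that compilation.
    config: dict = {}
    if isinstance(compile_range, tuple):
        config["max_autotune"] = True
        config["coordinate_descent_tuning"] = True
    return config

print(describe_compile_range((4, 4)))   # the docstring's concrete-size example
print(describe_compile_range((8, 32)))  # a proper range
print(make_inductor_config(None))       # {} -> no autotuning for dynamic shape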

vllm/compilation/cuda_piecewise_backend.py

Lines changed: 38 additions & 41 deletions
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
-from typing import Any, Callable
+from typing import Any, Callable, Optional

 import torch.fx as fx

@@ -11,17 +11,15 @@
 from vllm.compilation.monitor import end_monitoring_torch_compile
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from typing import Optional

 logger = init_logger(__name__)

+
 @dataclasses.dataclass
-class ConditionalEntry:
-    runtime_shape: int
+class RangeEntry:
+    compile_range: Optional[tuple[int, int]]
     compiled: bool = False
     runnable: Callable = None  # type: ignore
-    runtime_range: Optional[tuple[int,
-                                  int]] = None  # only used for range entries


 class PiecewiseBackend:
@@ -55,9 +53,25 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,

         self.compile_sizes: set[int] = set(
             self.compilation_config.compile_sizes)
-        self.compile_ranges: tuple[
-            int, int] = self.compilation_config.compile_ranges
-        self.is_in_range = lambda x, range: range[0] <= x <= range[1]
+        self.compile_ranges_split_points: list[
+            int] = self.compilation_config.compile_ranges_split_points
+        self.compile_ranges = []
+        split_points = sorted(
+            set(self.compile_sizes).union(set(
+                self.compile_ranges_split_points)))
+        for i, s in enumerate(split_points):
+            if i == 0:
+                self.compile_ranges.append((1, s))
+            else:
+                self.compile_ranges.append((split_points[i - 1], s))
+            if s in self.compile_sizes:
+                self.compile_ranges.append((s, s))
+        self.compile_ranges = sorted(self.compile_ranges)
+        logger.debug("PiecewiseBackend: compile_ranges: %s",
+                     self.compile_ranges)
+
+        self.is_in_range = lambda x, range: range[0] <= x < range[1] if range[
+            0] < range[1] else x == range[0]

         self.first_run_finished = False

@@ -68,28 +82,26 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"

         # the entries for different shapes that we need to compile
-        self.concrete_size_entries: dict[int, ConditionalEntry] = {}
+        # self.concrete_size_entries: dict[int, RangeEntry] = {}

         # the entries for ranges that we need to either
         # TODO: we should merge with concrete_size_entries
-        self.range_entries: dict[tuple[int, int], ConditionalEntry] = {}
+        self.range_entries: dict[tuple[int, int], RangeEntry] = {}

-        # to_be_compiled_sizes tracks the remaining sizes to compile,
+        # to_be_compiled_ranges tracks the remaining ranges to compile,
         # and updates during the compilation process, so we need to copy it
-        self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy()
         self.to_be_compiled_ranges: set[tuple[int,
                                               int]] = set(self.compile_ranges)

         # We only keep compilation management inside this class directly.
-        for shape in self.compile_sizes:
-            self.concrete_size_entries[shape] = ConditionalEntry(
-                runtime_shape=shape,
+        for range in self.compile_ranges:
+            self.range_entries[range] = RangeEntry(
+                compile_range=range,
                 runnable=self.compiled_graph_for_general_shape,
             )

     def check_for_ending_compilation(self):
-        if (self.is_last_graph and not self.to_be_compiled_sizes
-                and not self.to_be_compiled_ranges):
+        if (self.is_last_graph and not self.to_be_compiled_ranges):
             # no specific sizes to compile
             # save the hash of the inductor graph for the next run
             self.vllm_backend.compiler_manager.save_to_file()
@@ -103,47 +115,32 @@ def __call__(self, *args) -> Any:

         runtime_shape = args[self.sym_shape_indices[0]]

-
         range_entry = None
         for range in self.compile_ranges:
             if self.is_in_range(runtime_shape, range):
-                if range not in self.range_entries:
-                    self.range_entries[range] = ConditionalEntry(
-                        runtime_shape=runtime_shape,
-                        runtime_range=range,
-                    )
                 range_entry = self.range_entries[range]
                 break

-        if (runtime_shape not in self.concrete_size_entries
-                and range_entry is None):
+        if (range_entry is None):
             # we don't need to do anything for this shape
             return self.compiled_graph_for_general_shape(*args)

-        if range_entry is not None:
-            entry = range_entry
-        else:
-            entry = self.concrete_size_entries[runtime_shape]
+        if not range_entry.compiled:
+            range_entry.compiled = True
+            self.to_be_compiled_ranges.remove(range_entry.compile_range)

-        if not entry.compiled:
-            entry.compiled = True
-            if range_entry is not None:
-                self.to_be_compiled_ranges.remove(range_entry.runtime_range)
-            else:
-                self.to_be_compiled_sizes.remove(runtime_shape)
             # args are real arguments
-            entry.runnable = self.vllm_backend.compiler_manager.compile(
+            range_entry.runnable = self.vllm_backend.compiler_manager.compile(
                 self.graph,
                 args,
                 self.compilation_config.inductor_compile_config,
                 self.compilation_config,
                 graph_index=self.piecewise_compile_index,
                 num_graphs=self.total_piecewise_compiles,
-                runtime_shape=runtime_shape)
+                compile_range=range_entry.compile_range)

             # finished compilations for all required shapes
-            if (self.is_last_graph and not self.to_be_compiled_sizes
-                    and not self.to_be_compiled_ranges):
+            if (self.is_last_graph and not self.to_be_compiled_ranges):
                 self.check_for_ending_compilation()

-        return entry.runnable(*args)
+        return range_entry.runnable(*args)
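
Editorial note: the range construction above is easiest to follow with concrete numbers. The standalone sketch below re-runs the same loop with hypothetical inputs (compile_sizes = {8} and split points [32, 128] are made up for illustration) and shows which range a given runtime batch size lands in; in the real backend a miss falls back to compiled_graph_for_general_shape. Because the list is sorted, a degenerate (s, s) entry always sits before the (s, hi) range that also contains s, so concrete compile sizes take precedence.

compile_sizes = {8}                      # hypothetical value
compile_ranges_split_points = [32, 128]  # hypothetical value

# Same construction as PiecewiseBackend.__init__ above.
compile_ranges: list[tuple[int, int]] = []
split_points = sorted(set(compile_sizes).union(compile_ranges_split_points))
for i, s in enumerate(split_points):
    if i == 0:
        compile_ranges.append((1, s))
    else:
        compile_ranges.append((split_points[i - 1], s))
    if s in compile_sizes:
        compile_ranges.append((s, s))  # a compile size becomes a (s, s) range
compile_ranges = sorted(compile_ranges)
assert compile_ranges == [(1, 8), (8, 8), (8, 32), (32, 128)]

def is_in_range(x: int, r: tuple[int, int]) -> bool:
    # half-open [lo, hi) for real ranges, exact match for degenerate (s, s)
    return r[0] <= x < r[1] if r[0] < r[1] else x == r[0]

def pick_range(x: int):
    # mirrors the lookup loop in __call__: first match in sorted order wins
    for r in compile_ranges:
        if is_in_range(x, r):
            return r
    return None  # the real backend runs the general-shape graph here

assert pick_range(8) == (8, 8)      # concrete entry wins over (8, 32)
assert pick_range(20) == (8, 32)
assert pick_range(100) == (32, 128)
assert pick_range(500) is None      # outside every range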

vllm/compilation/inductor_pass.py

Lines changed: 6 additions & 5 deletions
@@ -25,8 +25,8 @@

 class PassContext:

-    def __init__(self, runtime_shape: Optional[int]):
-        self.runtime_shape = runtime_shape
+    def __init__(self, compile_range: Optional[tuple[int, int]]):
+        self.compile_range = compile_range


 def get_pass_context() -> PassContext:
@@ -36,13 +36,13 @@ def get_pass_context() -> PassContext:


 @contextmanager
-def pass_context(runtime_shape: Optional[int]):
+def pass_context(compile_range: Optional[tuple[int, int]]):
     """A context manager that stores the current pass context,
     usually it is a list of sizes to specialize.
     """
     global _pass_context
     prev_context = _pass_context
-    _pass_context = PassContext(runtime_shape)
+    _pass_context = PassContext(compile_range)
     try:
         yield
     finally:
@@ -93,7 +93,8 @@ def hash_dict(dict_: dict[Any, Any]):
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()

-    def is_applicable_for_shape(self, shape: Optional[int]):
+    def is_applicable_for_range(self, compile_range: Optional[tuple[int,
+                                                                    int]]):
         return True


vllm/compilation/pass_manager.py

Lines changed: 2 additions & 2 deletions
@@ -43,9 +43,9 @@ def __init__(self):
         self.passes: list[VllmInductorPass] = []

     def __call__(self, graph: fx.Graph):
-        shape = get_pass_context().runtime_shape
+        compile_range = get_pass_context().compile_range
         for pass_ in self.passes:
-            if pass_.is_applicable_for_shape(shape):
+            if pass_.is_applicable_for_range(compile_range):
                 pass_(graph)

         # always run fix_functionalization last
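
Editorial note: taken together with the inductor_pass.py change above, the dispatch now works as follows: the compiler stashes the active range via pass_context(), and the pass manager asks each pass is_applicable_for_range() before running it. The sketch below is a simplified, self-contained restatement, not the vLLM code itself; the PassContext plumbing is reduced to the essentials and SkipOnRangesPass is a made-up pass used only for illustration.

from contextlib import contextmanager
from typing import Optional

class PassContext:
    def __init__(self, compile_range: Optional[tuple[int, int]]):
        self.compile_range = compile_range

_pass_context = PassContext(None)

def get_pass_context() -> PassContext:
    return _pass_context

@contextmanager
def pass_context(compile_range: Optional[tuple[int, int]]):
    # stores the current range for the duration of one compilation
    global _pass_context
    prev = _pass_context
    _pass_context = PassContext(compile_range)
    try:
        yield
    finally:
        _pass_context = prev

class SkipOnRangesPass:
    # made-up pass: only applicable when the range is one concrete size
    def is_applicable_for_range(
            self, compile_range: Optional[tuple[int, int]]) -> bool:
        return (compile_range is not None
                and compile_range[0] == compile_range[1])

    def __call__(self, graph) -> None:
        print("rewriting", graph)

def run_passes(graph, passes) -> None:
    # mirrors the pass manager's __call__ in the diff above
    compile_range = get_pass_context().compile_range
    for pass_ in passes:
        if pass_.is_applicable_for_range(compile_range):
            pass_(graph)

with pass_context((8, 8)):
    run_passes("graph-for-size-8", [SkipOnRangesPass()])    # pass runs
with pass_context((8, 32)):
    run_passes("graph-for-a-range", [SkipOnRangesPass()])   # pass skipped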

vllm/compilation/sequence_parallelism.py

Lines changed: 5 additions & 2 deletions
@@ -469,9 +469,12 @@ def __init__(self, config: VllmConfig):
         # and allow multiple values of epsilon.
         torch._inductor.pattern_matcher._seen_patterns.clear()

-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
+    def is_applicable_for_range(
+            self, compile_range: Optional[tuple[int, int]]) -> bool:
         tp_size = get_tensor_model_parallel_world_size()
-        return shape is not None and shape % tp_size == 0
+        return compile_range is not None and (
+            compile_range[0]
+            == compile_range[1]) and (compile_range[1] % tp_size == 0)

     def __call__(self, graph: fx.Graph):
         self.begin()
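
Editorial note: the net effect of the new predicate is that the sequence-parallelism pass only fires when the compile range is a single concrete size divisible by the tensor-parallel world size; a genuine range or a fully dynamic shape disables it, whereas previously any concrete shape divisible by tp_size qualified. A small sketch of the rule, with sp_pass_applicable as a hypothetical free-function restatement of the method above:

from typing import Optional

def sp_pass_applicable(compile_range: Optional[tuple[int, int]],
                       tp_size: int) -> bool:
    # restates is_applicable_for_range from the diff above
    return (compile_range is not None
            and compile_range[0] == compile_range[1]
            and compile_range[1] % tp_size == 0)

assert sp_pass_applicable((8, 8), tp_size=4)       # concrete and divisible
assert not sp_pass_applicable((6, 6), tp_size=4)   # concrete, not divisible
assert not sp_pass_applicable((8, 32), tp_size=4)  # a real range: skipped
assert not sp_pass_applicable(None, tp_size=4)     # dynamic shape: skipped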
