Commit f6f8399

Deepseek-v3 Batch Invariant on 8xH100
Signed-off-by: Bram Wasti <bwasti@meta.com>
1 parent d1fcab6 commit f6f8399

File tree

25 files changed: +937 −149 lines changed

tests/v1/generation/test_batch_invariance.py

Lines changed: 245 additions & 74 deletions
Large diffs are not rendered by default.
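Since the large diff for this test file is not rendered, here is a minimal sketch of the kind of property such a test checks: a greedy request must produce token-for-token identical output whether it is decoded alone or mixed into a larger batch. The model id, prompts, and batch shape below are illustrative assumptions, not the actual test code.

# Hypothetical sketch of a batch-invariance check (not the actual test).
from vllm import LLM, SamplingParams

def check_batch_invariance():
    # Assumed model id; the commit title targets DeepSeek-V3 on 8xH100.
    llm = LLM(model="deepseek-ai/DeepSeek-V3", tensor_parallel_size=8)
    params = SamplingParams(temperature=0.0, max_tokens=32)

    # Decode the probe prompt alone (batch size 1).
    solo = llm.generate(["The capital of France is"], params)

    # Decode the same prompt mixed into a larger batch.
    mixed = llm.generate(
        ["The capital of France is"] + ["filler prompt"] * 7, params
    )

    # Batch-invariant kernels must make the two runs bit-identical.
    assert solo[0].outputs[0].token_ids == mixed[0].outputs[0].token_ids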

vllm/config/model.py

Lines changed: 8 additions & 0 deletions
@@ -426,6 +426,14 @@ def __post_init__(
         skip_mm_profiling: Optional[bool],
         video_pruning_rate: Optional[float],
     ) -> None:
+        # Enable batch invariance settings if requested
+        from vllm.model_executor.layers.batch_invariant import (
+            vllm_kernel_override_batch_invariant,
+        )
+
+        if vllm_kernel_override_batch_invariant():
+            self.enforce_eager = True
+
         # Set the default seed to 0 in V1.
         # NOTE(woosuk): In V0, we set the default seed to None because the
         # driver worker shares the same process as the user process, and thus
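Each file in this commit gates its override on the same helper, imported from vllm/model_executor/layers/batch_invariant.py; its body is not shown in the hunks here. A plausible sketch, assuming it is a simple environment-variable check (the variable name is an assumption):

# Plausible sketch of the gate used throughout this commit; the real
# implementation lives in batch_invariant.py and is not part of this diff.
import os

def vllm_kernel_override_batch_invariant() -> bool:
    # Assumed env var name; treat any non-empty truthy value as "enabled".
    value = os.getenv("VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT", "0")
    return value.strip().lower() not in ("", "0", "false")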

vllm/config/parallel.py

Lines changed: 7 additions & 0 deletions
@@ -531,8 +531,15 @@ def use_ray(self) -> bool:
     def _verify_args(self) -> Self:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
+        from vllm.model_executor.layers.batch_invariant import (
+            vllm_kernel_override_batch_invariant,
+        )
         from vllm.platforms import current_platform

+        # Enable batch invariance settings if requested
+        if vllm_kernel_override_batch_invariant():
+            self.disable_custom_all_reduce = True
+
         if (
             self.distributed_executor_backend is not None
             and not isinstance(self.distributed_executor_backend, str)
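A plausible rationale, not stated in the commit: the custom all-reduce kernel can pick different algorithms (and therefore different floating-point summation orders) depending on message size and topology, which breaks run-to-run bit-equality. Pinning communication to the stock path keeps reductions on one deterministic code path; the two communicator changes below disable the symmetric-memory all-reduce variants for the same reason.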

vllm/config/scheduler.py

Lines changed: 8 additions & 0 deletions
@@ -170,12 +170,20 @@ def compute_hash(self) -> str:
         return hash_str

     def __post_init__(self, is_encoder_decoder: bool) -> None:
+        from vllm.model_executor.layers.batch_invariant import (
+            vllm_kernel_override_batch_invariant,
+        )
+
         if self.max_model_len is None:
             self.max_model_len = 8192

         if self.max_num_seqs is None:
             self.max_num_seqs = 128

+        # Enable batch invariance settings if requested
+        if vllm_kernel_override_batch_invariant():
+            self.enable_chunked_prefill = False
+
         if is_encoder_decoder:
             # Chunked prefill should be disabled for encoder-decoder models.
             self.disable_chunked_mm_input = True
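Taken together with the model and parallel config hunks above, this means a single flag reconfigures the engine for batch-invariant execution. A minimal usage sketch, again assuming the environment-variable name:

# Assumed flow: set the flag before engine construction so that the
# __post_init__ hooks patched in this commit can observe it.
import os
os.environ["VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT"] = "1"  # assumed name

from vllm import LLM

llm = LLM(model="deepseek-ai/DeepSeek-V3", tensor_parallel_size=8)
# The engine now comes up with enforce_eager=True,
# disable_custom_all_reduce=True, and chunked prefill disabled.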

vllm/distributed/device_communicators/all_reduce_utils.py

Lines changed: 6 additions & 0 deletions
@@ -70,6 +70,12 @@ def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor)
     from vllm.distributed.device_communicators.pynccl_allocator import (
         is_symmetric_memory_enabled,
     )
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_kernel_override_batch_invariant,
+    )
+
+    if vllm_kernel_override_batch_invariant():
+        return False

     if not is_symmetric_memory_enabled():
         return False

vllm/distributed/device_communicators/symm_mem.py

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,9 @@
     SYMM_MEM_ALL_REDUCE_MAX_SIZES,
 )
 from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_kernel_override_batch_invariant,
+)
 from vllm.platforms import current_platform

 try:
@@ -96,6 +99,8 @@ def __init__(
             return
         self.force_multimem = force_multimem
         self.disabled = False
+        if vllm_kernel_override_batch_invariant():
+            self.disabled = True

     def should_use_symm_mem(self, inp: torch.Tensor):
         if self.disabled:

vllm/engine/arg_utils.py

Lines changed: 3 additions & 2 deletions
@@ -1686,11 +1686,12 @@ def _set_default_args(
     ) -> None:
         """Set Default Arguments for V1 Engine."""

-        # V1 always uses chunked prefills and prefix caching
+        # V1 uses chunked prefills and prefix caching by default
         # for non-pooling tasks.
         # For pooling tasks the default is False
         if model_config.runner_type != "pooling":
-            self.enable_chunked_prefill = True
+            if self.enable_chunked_prefill is None:
+                self.enable_chunked_prefill = True

         # TODO: When prefix caching supports prompt embeds inputs, this
         # check can be removed.
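This turns an unconditional assignment into a true default: enable_chunked_prefill = True is applied only while the option is still unset (None), so an explicitly configured value, such as the False required for batch-invariant runs, is respected rather than overwritten.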
