From 725429eb8b811a30f2d76886c22d6124a84d0a6b Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin
Date: Tue, 22 Jul 2025 15:14:22 -0700
Subject: [PATCH] Fix use_irope not set correctly in V1

Signed-off-by: Yong Hoon Shin
---
 vllm/attention/layer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 1b80fa19d54f..178453ecdc4e 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -143,6 +143,8 @@ def __init__(
         # the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)
 
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ def __init__(
             kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)
 
         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant
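
A minimal standalone sketch of the bug this diff fixes (illustrative only,
not vLLM code): in the V1 path, pop() removes "use_irope" from
extra_impl_args, so the later unconditional get() overwrote the attribute
with the default False.

    # Hypothetical, self-contained repro of the pre-fix behavior.
    extra_impl_args = {"use_irope": True}

    # V1 branch: pop() returns True and deletes the key from the dict.
    use_irope = extra_impl_args.pop("use_irope", False)

    # The pre-fix code then ran this assignment unconditionally; the key
    # is already gone, so get() falls back to False and clobbers the
    # correct value.
    use_irope = extra_impl_args.get("use_irope", False)

    assert use_irope is False  # the bug: a True input ends up as False

The patch keeps the pop() result authoritative on the V1 path by moving the
get() fallback into an else branch and deleting the later assignment.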