diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 1b80fa19d54f..178453ecdc4e 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -143,6 +143,8 @@ def __init__(
         # the backends)
         if envs.VLLM_USE_V1:
             self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)

         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ def __init__(
             kv_sharing_target_layer_name, **extra_impl_args)
         self.backend = backend_name_to_enum(attn_backend.get_name())
         self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)

         # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
         # torch.compile works by registering the attention as one giant
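A minimal standalone sketch (plain Python, not vLLM code) of the `dict.pop` vs `dict.get` distinction the two branches above rely on: the V1 branch uses `pop`, which removes `use_irope` from `extra_impl_args` before the dict is later expanded with `**extra_impl_args`, while the new fallback branch only reads it with `get` and leaves the dict untouched. The example keys below are illustrative, not taken from the patch.

```python
# Illustration only: behaviour of pop() vs get() on the kwargs dict.
extra_impl_args = {"use_irope": True, "logits_soft_cap": 30.0}

# V1-style: pop() returns the value AND removes the key, so "use_irope"
# is no longer present when the dict is forwarded via **extra_impl_args.
use_irope_v1 = extra_impl_args.pop("use_irope", False)
assert use_irope_v1 is True
assert "use_irope" not in extra_impl_args

# Fallback-style: get() only reads the value; the key stays in the dict
# and is still forwarded along with the other impl arguments.
extra_impl_args = {"use_irope": True, "logits_soft_cap": 30.0}
use_irope_fallback = extra_impl_args.get("use_irope", False)
assert use_irope_fallback is True
assert "use_irope" in extra_impl_args
```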