@@ -1147,20 +1147,15 @@ def create_engine_config(
11471147        else :
11481148            envs .set_vllm_use_v1 (use_v1 )
11491149
1150-         # Set default arguments for V0 or V1 Engine. 
1151-         if  use_v1 :
1152-             self ._set_default_args_v1 (usage_context , model_config )
1153-             # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1 
1154-             if  current_platform .is_cpu (
1155-             ) and  current_platform .get_cpu_architecture () in  (
1156-                     CpuArchEnum .POWERPC , CpuArchEnum .S390X , CpuArchEnum .ARM ):
1157-                 logger .info (
1158-                     "Chunked prefill is not supported for ARM and POWER " 
1159-                     "and S390X CPUs; " 
1160-                     "disabling it for V1 backend." )
1161-                 self .enable_chunked_prefill  =  False 
1162-         else :
1163-             self ._set_default_args_v0 (model_config )
1150+         # Set default arguments for V1 Engine. 
1151+         self ._set_default_args (usage_context , model_config )
1152+         # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1 
1153+         if  current_platform .is_cpu () and  current_platform .get_cpu_architecture (
1154+         ) in  (CpuArchEnum .POWERPC , CpuArchEnum .S390X , CpuArchEnum .ARM ):
1155+             logger .info ("Chunked prefill is not supported for ARM and POWER " 
1156+                         "and S390X CPUs; " 
1157+                         "disabling it for V1 backend." )
1158+             self .enable_chunked_prefill  =  False 
11641159        assert  self .enable_chunked_prefill  is  not   None 
11651160
11661161        sliding_window : Optional [int ] =  None 
@@ -1528,64 +1523,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
15281523
15291524        return  True 
15301525
def _set_default_args_v0(self, model_config: ModelConfig) -> None:
    """Set default arguments for the V0 engine.

    Resolves ``self.enable_chunked_prefill`` when it was left as ``None``,
    warns when a long-context model runs without chunked prefill, disables
    prefix caching for multimodal models, and defaults
    ``self.max_num_seqs`` to 256 when it is unset.

    Args:
        model_config: Model configuration. This method reads
            ``max_model_len``, ``is_multimodal_model`` and ``use_mla`` from
            it and calls ``get_sliding_window()``.
    """
    max_model_len = model_config.max_model_len
    # "Long context" here means strictly more than 32K tokens.
    use_long_context = max_model_len > 32768

    if self.enable_chunked_prefill is None:
        # Chunked prefill not supported for Multimodal or MLA in V0.
        if model_config.is_multimodal_model or model_config.use_mla:
            self.enable_chunked_prefill = False

        # Enable chunked prefill by default for long context (> 32K)
        # models to avoid OOM errors in initial memory profiling phase.
        elif use_long_context:
            is_gpu = current_platform.is_cuda()
            use_sliding_window = (
                model_config.get_sliding_window() is not None)
            use_spec_decode = self.speculative_config is not None

            # Only turn it on automatically for CUDA, and only when none
            # of sliding window / speculative decoding / LoRA is in use.
            if (is_gpu and not use_sliding_window and not use_spec_decode
                    and not self.enable_lora):
                self.enable_chunked_prefill = True
                logger.warning(
                    "Chunked prefill is enabled by default for models "
                    "with max_model_len > 32K. Chunked prefill might "
                    "not work with some features or models. If you "
                    "encounter any issues, please disable by launching "
                    "with --enable-chunked-prefill=False.")

        # Anything still undecided falls back to disabled.
        if self.enable_chunked_prefill is None:
            self.enable_chunked_prefill = False

    if not self.enable_chunked_prefill and use_long_context:
        # Fixed: the first fragment needs a trailing space, otherwise the
        # implicitly concatenated message reads "...may causeOOM during...".
        logger.warning(
            "The model has a long context length (%s). This may cause "
            "OOM during the initial memory profiling phase, or result "
            "in low performance due to small KV cache size. Consider "
            "setting --max-model-len to a smaller value.", max_model_len)

    # Disable prefix caching for multimodal models for VLLM_V0.
    if self.enable_prefix_caching and model_config.is_multimodal_model:
        logger.warning(
            "--enable-prefix-caching is not supported for multimodal "
            "models in V0 and has been disabled.")
        self.enable_prefix_caching = False

        # NOTE(review): this check only runs for multimodal models, after
        # prefix caching has already been switched off just above, so the
        # assignment below is redundant. Confirm whether it was meant to
        # apply to every model that enables prompt embeds.
        if self.enable_prompt_embeds:
            logger.warning(
                "--enable-prompt-embeds and --enable-prefix-caching "
                "are not supported together in V0. Prefix caching has "
                "been disabled.")
            self.enable_prefix_caching = False

    # Set max_num_seqs to 256 for VLLM_V0.
    if self.max_num_seqs is None:
        self.max_num_seqs = 256
1586- 
1587-     def  _set_default_args_v1 (self , usage_context : UsageContext ,
1588-                              model_config : ModelConfig ) ->  None :
1526+     def  _set_default_args (self , usage_context : UsageContext ,
1527+                           model_config : ModelConfig ) ->  None :
15891528        """Set Default Arguments for V1 Engine.""" 
15901529
15911530        # V1 always uses chunked prefills and prefix caching 
0 commit comments