@@ -1111,41 +1111,41 @@ def load_model(self) -> None:
         with DeviceMemoryProfiler(self.device) as m:
             time_before_load = time.perf_counter()
             self.model = get_model(vllm_config=self.vllm_config)
-            if self.lora_config:
-                assert supports_lora(
-                    self.model
-                ), f"{self.model.__class__.__name__} does not support LoRA yet."
-
-                if supports_multimodal(self.model):
-                    logger.warning(
-                        "Regarding multimodal models, vLLM currently "
-                        "only supports adding LoRA to language model.")
-                # It's necessary to distinguish between the
-                # max_position_embeddings of VLMs and LLMs.
-                if hasattr(self.model.config, "max_position_embeddings"):
-                    max_pos_embeddings = (
-                        self.model.config.max_position_embeddings)
-                else:
-                    max_pos_embeddings = (
-                        self.model.config.text_config.max_position_embeddings)
-
-                self.lora_manager = LRUCacheWorkerLoRAManager(
-                    self.scheduler_config.max_num_seqs,
-                    self.scheduler_config.max_num_batched_tokens,
-                    self.vocab_size,
-                    self.lora_config,
-                    self.device,
-                    self.model.embedding_modules,
-                    self.model.embedding_padding_modules,
-                    max_position_embeddings=max_pos_embeddings,
-                )
-                self.model = self.lora_manager.create_lora_manager(self.model)
             time_after_load = time.perf_counter()
 
         self.model_memory_usage = m.consumed_memory
         logger.info("Model loading took %.4f GB and %.6f seconds",
                     self.model_memory_usage / float(2**30),
                     time_after_load - time_before_load)
+
+        if self.lora_config:
+            assert supports_lora(
+                self.model
+            ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+            if supports_multimodal(self.model):
+                logger.warning("Regarding multimodal models, vLLM currently "
+                               "only supports adding LoRA to language model.")
+            # It's necessary to distinguish between the max_position_embeddings
+            # of VLMs and LLMs.
+            if hasattr(self.model.config, "max_position_embeddings"):
+                max_pos_embeddings = self.model.config.max_position_embeddings
+            else:
+                max_pos_embeddings = (
+                    self.model.config.text_config.max_position_embeddings)
+
+            self.lora_manager = LRUCacheWorkerLoRAManager(
+                self.scheduler_config.max_num_seqs,
+                self.scheduler_config.max_num_batched_tokens,
+                self.vocab_size,
+                self.lora_config,
+                self.device,
+                self.model.embedding_modules,
+                self.model.embedding_padding_modules,
+                max_position_embeddings=max_pos_embeddings,
+            )
+            self.model = self.lora_manager.create_lora_manager(self.model)
+
         if self.prompt_adapter_config:
             self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
                 self.scheduler_config.max_num_seqs,
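
The net effect of this hunk is that the LoRA manager setup moves out of the DeviceMemoryProfiler context and below the load-time log, so the reported memory and elapsed time cover only the base model load. Below is a minimal, runnable sketch of the resulting ordering; the profiler, loader, and LoRA wrapper are hypothetical stand-ins for DeviceMemoryProfiler, get_model, and LRUCacheWorkerLoRAManager, not vLLM's actual classes.

import time
from contextlib import contextmanager


@contextmanager
def device_memory_profiler():
    # Hypothetical stand-in for DeviceMemoryProfiler: reports a fake figure
    # for whatever was "allocated" inside the `with` block.
    class _Result:
        consumed_memory = 0

    result = _Result()
    yield result
    result.consumed_memory = 2 * 2**30  # pretend the load consumed 2 GiB


def load_model(lora_enabled: bool = True):
    with device_memory_profiler() as m:
        time_before_load = time.perf_counter()
        model = {"name": "base-model"}  # stand-in for get_model(...)
        time_after_load = time.perf_counter()

    # The logged figures now reflect only the base-model load.
    print("Model loading took %.4f GB and %.6f seconds" %
          (m.consumed_memory / float(2**30),
           time_after_load - time_before_load))

    if lora_enabled:
        # Stand-in for LRUCacheWorkerLoRAManager(...).create_lora_manager(model),
        # which after this change runs outside the profiler and after the log.
        model = {"lora_wrapper": model}
    return model


if __name__ == "__main__":
    load_model()

Presumably this keeps the "Model loading took ..." log comparable across runs regardless of whether LoRA is enabled.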