@@ -1111,41 +1111,41 @@ def load_model(self) -> None:
         with DeviceMemoryProfiler(self.device) as m:
             time_before_load = time.perf_counter()
             self.model = get_model(vllm_config=self.vllm_config)
+            if self.lora_config:
+                assert supports_lora(
+                    self.model
+                ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+                if supports_multimodal(self.model):
+                    logger.warning(
+                        "Regarding multimodal models, vLLM currently "
+                        "only supports adding LoRA to language model.")
+                # It's necessary to distinguish between the
+                # max_position_embeddings of VLMs and LLMs.
+                if hasattr(self.model.config, "max_position_embeddings"):
+                    max_pos_embeddings = (
+                        self.model.config.max_position_embeddings)
+                else:
+                    max_pos_embeddings = (
+                        self.model.config.text_config.max_position_embeddings)
+
+                self.lora_manager = LRUCacheWorkerLoRAManager(
+                    self.scheduler_config.max_num_seqs,
+                    self.scheduler_config.max_num_batched_tokens,
+                    self.vocab_size,
+                    self.lora_config,
+                    self.device,
+                    self.model.embedding_modules,
+                    self.model.embedding_padding_modules,
+                    max_position_embeddings=max_pos_embeddings,
+                )
+                self.model = self.lora_manager.create_lora_manager(self.model)
             time_after_load = time.perf_counter()
 
         self.model_memory_usage = m.consumed_memory
         logger.info("Model loading took %.4f GB and %.6f seconds",
                     self.model_memory_usage / float(2**30),
                     time_after_load - time_before_load)
-
-        if self.lora_config:
-            assert supports_lora(
-                self.model
-            ), f"{self.model.__class__.__name__} does not support LoRA yet."
-
-            if supports_multimodal(self.model):
-                logger.warning("Regarding multimodal models, vLLM currently "
-                               "only supports adding LoRA to language model.")
-            # It's necessary to distinguish between the max_position_embeddings
-            # of VLMs and LLMs.
-            if hasattr(self.model.config, "max_position_embeddings"):
-                max_pos_embeddings = self.model.config.max_position_embeddings
-            else:
-                max_pos_embeddings = (
-                    self.model.config.text_config.max_position_embeddings)
-
-            self.lora_manager = LRUCacheWorkerLoRAManager(
-                self.scheduler_config.max_num_seqs,
-                self.scheduler_config.max_num_batched_tokens,
-                self.vocab_size,
-                self.lora_config,
-                self.device,
-                self.model.embedding_modules,
-                self.model.embedding_padding_modules,
-                max_position_embeddings=max_pos_embeddings,
-            )
-            self.model = self.lora_manager.create_lora_manager(self.model)
-
         if self.prompt_adapter_config:
             self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
                 self.scheduler_config.max_num_seqs,
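The hunk moves the LoRA-manager setup inside the DeviceMemoryProfiler block, so the memory and time spent in create_lora_manager are counted in self.model_memory_usage and the logged load time. Below is a minimal standalone sketch of that pattern, assuming hypothetical stand-ins (MemoryProfiler, read_usage, build_model, wrap_with_lora) rather than vLLM's actual classes:

    import time


    class MemoryProfiler:
        """Toy stand-in for a device memory profiler (hypothetical, not vLLM's)."""

        def __init__(self, read_usage):
            # read_usage: callable returning current device memory use in bytes.
            self._read_usage = read_usage
            self.consumed_memory = 0

        def __enter__(self):
            self._baseline = self._read_usage()
            return self

        def __exit__(self, *exc_info):
            # Attribute whatever was allocated inside the block to this profiler.
            self.consumed_memory = self._read_usage() - self._baseline
            return False


    def load_model(read_usage, build_model, wrap_with_lora=None):
        # Build the model and, as in the diff, apply the LoRA wrapper inside the
        # same profiling block so its cost shows up in the reported numbers.
        with MemoryProfiler(read_usage) as m:
            time_before_load = time.perf_counter()
            model = build_model()
            if wrap_with_lora is not None:
                model = wrap_with_lora(model)  # previously ran outside the block
            time_after_load = time.perf_counter()

        print(f"Model loading took {m.consumed_memory / float(2**30):.4f} GB "
              f"and {time_after_load - time_before_load:.6f} seconds")
        return model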