from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import os
# os.environ['TORCH_DISABLE_FLASH_ATTENTION_2'] = '0'

model_name = "./models/Qwen2.5-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # attn_implementation="flash_attention_2",
    device_map="cuda:0"
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt * 300}  # repeat the prompt to test whether longer inputs blow up GPU memory, and how latency scales
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
text_list = [text for _ in range(40)]  # simulate 40 concurrent requests

# single input, no batch accumulation
model_dummy_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Only the prompt logits are needed, so pull the backbone and LM head out of
# the model class and run them directly instead of calling generate().
qwen2_model = model.model
qwen2_lm_head = model.lm_head

# inputs with batch_size = 20
batch_size = 20
model_batch_inputs = tokenizer([text for _ in range(batch_size)], return_tensors="pt").to(model.device)

with torch.no_grad():
    ## warmup
    for i in range(2):
        print(i)
        rslt = qwen2_model(
            input_ids=model_batch_inputs.input_ids,
            attention_mask=model_batch_inputs.attention_mask
        )
    torch.cuda.synchronize()  # CUDA kernels run asynchronously; sync so the wall-clock timings are accurate
    s_time = time.time()
    rslt = qwen2_lm_head(rslt.last_hidden_state)
    torch.cuda.synchronize()
    e_time = time.time()
    print("infer head:", e_time - s_time)

    ## time sequential inference: one request at a time, 40 passes
    s = time.time()
    for i in range(len(text_list)):
        print(i)
        rslt = qwen2_model(
            input_ids=model_dummy_inputs.input_ids,
            attention_mask=model_dummy_inputs.attention_mask
        )
        rslt = qwen2_lm_head(rslt.last_hidden_state)
    torch.cuda.synchronize()
    e = time.time()
    print(e - s)

    ## time batched inference; note this runs 40 // 20 * 3 = 6 batches of 20,
    ## i.e. 120 sequences, 3x the work of the sequential loop above
    s = time.time()
    for i in range(len(text_list) // batch_size * 3):
        print(i)
        rslt = qwen2_model(
            input_ids=model_batch_inputs.input_ids,
            attention_mask=model_batch_inputs.attention_mask
        )
        rslt = qwen2_lm_head(rslt.last_hidden_state)
    torch.cuda.synchronize()
    e = time.time()
    print(e - s)
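
# ---------------------------------------------------------------------------
# A minimal sketch (not part of the original script) of what to do with the
# prompt logits once you have them: per-token log-probabilities of the prompt.
# The helper name `prompt_logprobs` is illustrative. It reuses the batched
# tensors defined above; every sequence here is identical so there is no
# padding, but the attention mask is applied anyway, so the same code would
# also work with tokenizer(..., padding=True) on mixed-length prompts.
# ---------------------------------------------------------------------------
def prompt_logprobs(logits, input_ids, attention_mask):
    # The token at position t is predicted by the logits at position t-1,
    # so drop the last logit step and the first label token before gathering.
    log_probs = torch.log_softmax(logits[:, :-1, :].float(), dim=-1)
    labels = input_ids[:, 1:]
    token_lp = log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
    # Zero out padding positions so they don't contribute to any sums.
    return token_lp * attention_mask[:, 1:].to(token_lp.dtype)  # [batch, seq-1]

with torch.no_grad():
    logits = qwen2_lm_head(rslt.last_hidden_state)  # reuse the last batched forward pass
    lp = prompt_logprobs(logits, model_batch_inputs.input_ids, model_batch_inputs.attention_mask)
    print(lp.sum(dim=-1))  # total prompt log-prob per sequence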