performance degradation on dGPU Arc770 after loading more than one LLM model #12660

Open
qing-xu-intel opened this issue Jan 7, 2025 · 4 comments

@qing-xu-intel

  1. Test env is set up as follows:
     https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_linux_gpu.md
     https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HuggingFace/LLM/chatglm3
  2. OS env
     • dGPU Arc770
     • Linux kernel = 6.1.59
     • Ubuntu = 22.04
  3. python test.py
     If more than one model is loaded, inference latency increases (a sketch of the measurement pattern follows this list):
     • if only the chatglm3 model is loaded: llm infer 0.90 s
     • if only the whisper-small model is loaded: wsp infer 0.68 s
     • if whisper-small + chatglm3 are loaded:
       llm infer 1.22 s
       wsp infer 1.01 s
     • if 4 models are loaded:
       llm infer 2.07 s
       cpm infer 2.97 s
       sd infer 0.74 s
       wsp infer 1.93 s
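
The timings above are plain wall-clock measurements taken inside the script posted in the next comment. Below is a minimal sketch of that measurement pattern, assuming the same ipex-llm / XPU-enabled PyTorch setup; the torch.xpu.synchronize() calls are added here as an assumption so that kernels still queued by a previously exercised model are not counted against the next one:

import time
import torch

def timed_xpu_call(label, fn, report=None):
    """Time one inference call on the XPU (hypothetical helper, not part of the original script)."""
    torch.xpu.synchronize()   # drain any work queued by other models first
    st = time.time()
    out = fn()                # e.g. lambda: llm_model.generate(input_ids, max_new_tokens=128)
    torch.xpu.synchronize()   # wait for this model's kernels to finish
    elapsed = time.time() - st
    if report is not None:
        report.append('{} infer {:.2f} s'.format(label, elapsed))
    print('{} infer {:.2f} s'.format(label, elapsed))
    return out, elapsed

Usage would look like: timed_xpu_call('llm', lambda: llm_model.generate(input_ids, max_new_tokens=128), metric_report).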
@qing-xu-intel
Author

import os
import time
import torch
import requests
import librosa
from diffusers import DiffusionPipeline
from PIL import Image
from ipex_llm.transformers import AutoModel
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.optimize import low_memory_init, load_low_bit
from ipex_llm import optimize_model
from ipex_llm.transformers import AutoModelForSpeechSeq2Seq
from transformers import AutoTokenizer, AutoProcessor, WhisperProcessor, AutoModelForSeq2SeqLM

WHISPER_SAMPLING_RATE = 16000

def test_chatglm(llm_model, llm_tokenizer, report, is_report):
    with torch.inference_mode():
        # warm up =======================
        prompt = '问:[INST]请简单地回答。 <>\n\n\n<>\n\n请介绍一下上海 [/INST]\n\n答:'  # "Q: Please answer briefly. Please introduce Shanghai. A:"
        # tokenize the input prompt from string to token ids;
        # .to('xpu') moves the inputs to the Intel GPU for inference
        input_ids = llm_tokenizer.encode(prompt, return_tensors="pt").to('xpu')
        st = time.time()
        # predict the next tokens (maximum 128) based on the input token ids
        output = llm_model.generate(input_ids, max_new_tokens=128)
        if is_report:
            report.append('llm infer {:.2f} s'.format(time.time()-st))
        print(f'==============================llm generate {time.time()-st}')

        # decode the predicted token ids to an output string
        st = time.time()
        output = output.cpu()
        output_str = llm_tokenizer.decode(output[0], skip_special_tokens=True)
        print(f'==============================llm decode {time.time()-st}')
        print('-'*20, 'Output', '-'*20)
        print(output_str)

def test_sd(sd_model, report, is_report):
    imgidx = 1
    save_path = "/home/rdl/git/sd-img" + str(imgidx) + ".png"
    prompt_en = "a pig flying in the sky"
    prompt_en2 = "a horse running in the city"

    st = time.time()
    output_image = sd_model(
        prompt=prompt_en2,
        num_inference_steps=4,
        guidance_scale=1,
        num_images_per_prompt=1,
        output_type="pil",
        width=512,
        height=512,
        lcm_origin_steps=50,
    ).images[0]

    if is_report:
        report.append('sd  infer {:.2f} s'.format(time.time()-st))
    print("========== sd generate {:.2f} s".format(time.time()-st))  # 0.51 s, Arc770, 16GB
    output_image.save(save_path)
    print("========== sd save {:.2f} s".format(time.time()-st))  # 0.64 s, Arc770, 16GB

def test_minicpm(model, tokenizer, report, is_report):
    query = "图片里有什么?他们在干什么?"  # "What is in the picture? What are they doing?"
    stream = False
    image_path = '/home/rdl/qt_image_2/current.jpg'
    if os.path.exists(image_path):
        image = Image.open(image_path).convert('RGB')
    else:
        image = Image.open(requests.get(image_path, stream=True).raw).convert('RGB')

    # Generate predicted tokens
    # the prompt format follows https://huggingface.co/openbmb/MiniCPM-V-2_6/blob/main/README.md
    msgs = [{'role': 'user', 'content': [image, query]}]

    # an ipex_llm model needs a warmup run before inference time can be measured accurately
    model.chat(
        image=None,
        msgs=msgs,
        tokenizer=tokenizer,
    )

    if stream:
        res = model.chat(
            image=None,
            msgs=msgs,
            tokenizer=tokenizer,
            stream=True
        )

        print('-'*20, 'Input Image', '-'*20)
        print(image_path)
        print('-'*20, 'Input Prompt', '-'*20)
        print(query)
        print('-'*20, 'Stream Chat Output', '-'*20)
        for new_text in res:
            print(new_text, flush=True, end='')
    else:
        st = time.time()
        res = model.chat(
            image=None,
            msgs=msgs,
            tokenizer=tokenizer,
        )
        torch.xpu.synchronize()
        end = time.time()
        if is_report:
            report.append('cpm infer {:.2f} s'.format(end-st))
        print(f'=========minicpm Inference time: {end-st} s')
        print('-'*20, 'Input Image', '-'*20)
        print(image_path)
        print('-'*20, 'Input Prompt', '-'*20)
        print(query)
        print('-'*20, 'Chat Output', '-'*20)
        print(res)

def test_whisper(whisper_processor, whisper_model, report, is_report):
    init_wav_file = os.path.join(os.path.dirname(__file__), "test_draw-pic-of-horse.wav")
    audio, __ = librosa.load(path=init_wav_file, sr=WHISPER_SAMPLING_RATE)

    # https://github.com/huggingface/transformers/issues/22395, prompt for whisper
    simple_zh = "以下是普通话的句子[\INST]"  # "The following is a Mandarin sentence"; asks for simplified characters, otherwise traditional characters may appear
    prompt_ids = whisper_processor.get_prompt_ids(simple_zh)
    forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="chinese", task="transcribe")
    # Generate predicted tokens
    output_str = None
    with torch.inference_mode():
        if device == "GPU":
            input_features = whisper_processor(audio,
                                sampling_rate=WHISPER_SAMPLING_RATE,
                                return_tensors="pt").input_features.half().to('xpu')
        else:
            input_features = whisper_processor(audio,
                                sampling_rate=WHISPER_SAMPLING_RATE,
                                return_tensors="pt").input_features
        st = time.time()
        # if the selected model can use previous key/value attentions to speed up decoding
        # but has `"use_cache": false` in its model config, set `use_cache=True` explicitly
        # in `generate` to get optimal performance with the INT4 optimizations
        predicted_ids = whisper_model.generate(input_features,
                                    forced_decoder_ids=forced_decoder_ids,
                                    prompt_ids=prompt_ids)
        output_str = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output_str = output_str[0].split('[\INST]')[-1]
    inf_time = time.time() - st
    if is_report:
        report.append('wsp infer {:.2f} s'.format(inf_time))
    print('=============== whisper latency {:.2f} s'.format(inf_time))
    print('-'*20, 'Recognized text', '-'*20)
    print(f'{output_str}')
    print('-'*40)
    return output_str

if __name__ == '__main__':
    device = "GPU"
    llm_test = True
    sd_test = False
    minicpm_test = False
    wp_test = False
    metric_report = []
    test_loop = 10

    #############
    #load whisper
    #############
    if wp_test:
        st = time.time()
        whisper_model_path = "/home/rdl/git/whisper-small"

        # Load the model in 4 bit, which converts the relevant layers into INT4 format
        if device == "GPU":
            whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path,
                                                        load_in_4bit=True,
                                                        optimize_model=False,
                                                        use_cache=True)
            whisper_model.half().to('xpu')
        else:
            whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path,
                                                        load_in_4bit=True)
        whisper_model.config.forced_decoder_ids = None

        # Load processor
        whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
        print("============ whisper load to gpu")
        test_whisper(whisper_processor, whisper_model, metric_report, False)
        print("============ whisper warm up done!")

    #############
    #load chatglm
    #############
    if llm_test:
        llm_model_path = "/home/rdl/git/chatglm3-6b"
        save_directory = './open-chatglm3-3b-v2-ipex-llm-INT4'
        llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True)
        if os.path.exists(save_directory):
            llm_model = AutoModel.load_low_bit(save_directory, trust_remote_code=True)
        else:
            llm_model = AutoModel.from_pretrained(llm_model_path,
                                        load_in_4bit=True,
                                        optimize_model=True,
                                        trust_remote_code=True,
                                        use_cache=True)
            llm_model.save_low_bit(save_directory)
            del llm_model
            llm_model = AutoModel.load_low_bit(save_directory, trust_remote_code=True)

        if device == "GPU":
            llm_model.to('xpu')
        print("============ llm load to gpu")
        test_chatglm(llm_model, llm_tokenizer, metric_report, False)
        print("============ llm warm up done!")

    ##########################
    #load lcm stable diffusion
    ##########################
    if sd_test:
        sd_model_path = "/home/rdl/git/LCM_Dreamshaper_v7"
        sd_model = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path=sd_model_path,
                                                     torch_dtype=torch.float16)
        sd_model.to("xpu")
        print("============ sd load to gpu")
        test_sd(sd_model, metric_report, False)
        print("============ sd warm up done!")

    #############
    #load minicpm
    #############
    if minicpm_test:
        minicpm_model_path = "/home/rdl/git/MiniCPM-V-2_6"
        minicpm_lowbit_path = "./open-MiniCPM-V-26-ipex-llm-INT4"

        if not minicpm_lowbit_path or not os.path.exists(minicpm_lowbit_path):
            # Load the model in 4 bit, which converts the relevant layers into INT4 format.
            # When running LLMs on Intel iGPUs on Windows, setting `cpu_embedding=True` in
            # from_pretrained is recommended so the memory-intensive embedding layer runs
            # on the CPU instead of the iGPU.
            minicpm_model = AutoModel.from_pretrained(minicpm_model_path,
                                            load_in_low_bit="sym_int4",
                                            optimize_model=True,
                                            trust_remote_code=True,
                                            use_cache=True,
                                            modules_to_not_convert=["vpm", "resampler"])
            minicpm_tokenizer = AutoTokenizer.from_pretrained(minicpm_model_path,
                                                    trust_remote_code=True)
        else:
            minicpm_model = AutoModel.load_low_bit(minicpm_lowbit_path,
                                        optimize_model=True,
                                        trust_remote_code=True,
                                        use_cache=True,
                                        modules_to_not_convert=["vpm", "resampler"])
            minicpm_tokenizer = AutoTokenizer.from_pretrained(minicpm_lowbit_path,
                                                    trust_remote_code=True)

        minicpm_model.eval()

        if minicpm_model_path and not os.path.exists(minicpm_model_path):
            minicpm_processor = AutoProcessor.from_pretrained(minicpm_model_path,
                                                    trust_remote_code=True)
            minicpm_model.save_low_bit(minicpm_lowbit_path)
            minicpm_tokenizer.save_pretrained(minicpm_lowbit_path)
            minicpm_processor.save_pretrained(minicpm_lowbit_path)

        minicpm_model = minicpm_model.half().to('xpu')
        print("============ cpm load to gpu")
        test_minicpm(minicpm_model, minicpm_tokenizer, metric_report, False)
        print("============ cpm warm up done!")

    #############
    #test
    #############
    i = 0
    while i < test_loop:
        i += 1
        print('\n', '=='*20, ' TEST STARTING ', '=='*20)
        metric_report.append('\n-----------------loop {}'.format(i))
        if llm_test:
            print('\n=================test_chatglm')
            test_chatglm(llm_model, llm_tokenizer, metric_report, True)
        if minicpm_test:
            print('\n=================test_minicpm')
            test_minicpm(minicpm_model, minicpm_tokenizer, metric_report, True)
        if sd_test:
            print('\n=================test_sd')
            test_sd(sd_model, metric_report, True)
        if wp_test:
            print('\n=================test_whisper')
            test_whisper(whisper_processor, whisper_model, metric_report, True)

    for rp in metric_report:
        print(rp)

@qing-xu-intel
Author

Please use the values below to run the test cases (set a flag to True to run that case):
llm_test = True
sd_test = False
minicpm_test = False
wp_test = False
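
For reference, the scenarios measured in the issue description map to flag combinations like the following (a sketch; only these four flags are toggled, everything else in the script stays unchanged):

# chatglm3 only                     -> llm infer ~0.90 s
llm_test, sd_test, minicpm_test, wp_test = True, False, False, False

# whisper-small only                -> wsp infer ~0.68 s
llm_test, sd_test, minicpm_test, wp_test = False, False, False, True

# whisper-small + chatglm3          -> llm ~1.22 s, wsp ~1.01 s
llm_test, sd_test, minicpm_test, wp_test = True, False, False, True

# all four models                   -> llm ~2.07 s, cpm ~2.97 s, sd ~0.74 s, wsp ~1.93 s
llm_test, sd_test, minicpm_test, wp_test = True, True, True, True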

@ACupofAir
Contributor

(quotes the full test script from the comment above)
The code you provided has confusing indentation. Can you provide properly formatted code?

@qing-xu-intel
Author

test.txt

Please rename the attached test.txt to test.py.
