Intel GPU support initialization #4340

Merged: 8 commits, Oct 27, 2023
3 changes: 2 additions & 1 deletion extensions/multimodal/abstract_pipeline.py
@@ -3,6 +3,7 @@

import torch
from PIL import Image
from transformers import is_torch_xpu_available


class AbstractMultimodalPipeline(ABC):
@@ -55,7 +56,7 @@ def placeholder_embeddings() -> torch.Tensor:

def _get_device(self, setting_name: str, params: dict):
if params[setting_name] is None:
return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
return torch.device("cuda:0" if torch.cuda.is_available() else "xpu:0" if is_torch_xpu_available() else "cpu")
return torch.device(params[setting_name])

def _get_dtype(self, setting_name: str, params: dict):
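
The one-line change above is the core device-resolution pattern this PR repeats across modules: prefer CUDA, fall back to an Intel XPU when transformers reports one, otherwise use the CPU. A minimal standalone sketch of that fallback (resolve_device is an illustrative name, not part of the diff):

import torch
from transformers import is_torch_xpu_available


def resolve_device(explicit=None):
    # Honor an explicit user setting first, mirroring _get_device's params check.
    if explicit is not None:
        return torch.device(explicit)
    if torch.cuda.is_available():
        return torch.device("cuda:0")
    if is_torch_xpu_available():
        return torch.device("xpu:0")
    return torch.device("cpu")


print(resolve_device())       # e.g. device(type='xpu', index=0) on an Intel GPU
print(resolve_device("cpu"))  # an explicit setting always wins
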
3 changes: 2 additions & 1 deletion modules/AutoGPTQ_loader.py
@@ -1,5 +1,6 @@
from pathlib import Path

from accelerate import is_xpu_available
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

import modules.shared as shared
@@ -41,7 +42,7 @@ def load_quantized(model_name):
# Define the params for AutoGPTQForCausalLM.from_quantized
params = {
'model_basename': pt_path.stem,
'device': "cuda:0" if not shared.args.cpu else "cpu",
'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",
'use_triton': shared.args.triton,
'inject_fused_attention': not shared.args.no_inject_fused_attention,
'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
15 changes: 9 additions & 6 deletions modules/GPTQ_loader.py
@@ -5,15 +5,15 @@
import accelerate
import torch
import transformers
from accelerate import is_xpu_available
from gptq_for_llama import llama_inference_offload
from gptq_for_llama.modelutils import find_layers
from gptq_for_llama.quant import make_quant
from transformers import AutoConfig, AutoModelForCausalLM

import modules.shared as shared
from modules.logging_colors import logger

from gptq_for_llama import llama_inference_offload
from gptq_for_llama.modelutils import find_layers
from gptq_for_llama.quant import make_quant


# This function is a replacement for the load_quant function in the
# GPTQ-for_LLaMa repository. It supports more models and branches.
@@ -144,7 +144,7 @@ def load_quantized(model_name):
model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)

# accelerate offload (doesn't work properly)
if shared.args.gpu_memory or torch.cuda.device_count() > 1:
if shared.args.gpu_memory or torch.cuda.device_count() > 1 or (is_xpu_available() and torch.xpu.device_count() > 1):
if shared.args.gpu_memory:
memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
@@ -163,6 +163,9 @@

# No offload
elif not shared.args.cpu:
model = model.to(torch.device('cuda:0'))
if is_xpu_available():
model = model.to(torch.device("xpu:0"))
else:
model = model.to(torch.device('cuda:0'))

return model
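
The offload branch above turns the --gpu-memory and --cpu-memory strings into a per-device memory map. A simplified sketch of that translation, which the real code then hands to accelerate's device-map/offload machinery (build_max_memory is an illustrative name):

def build_max_memory(gpu_memory, cpu_memory=None):
    # e.g. gpu_memory=["10GiB", "8GiB"] -> {0: '10GiB', 1: '8GiB', 'cpu': '99GiB'}
    max_memory = {i: mem.strip() for i, mem in enumerate(gpu_memory)}
    max_memory['cpu'] = cpu_memory.strip() if cpu_memory is not None else '99GiB'
    return max_memory


print(build_max_memory(["10GiB", "8GiB"]))
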
4 changes: 4 additions & 0 deletions modules/LoRA.py
@@ -2,6 +2,7 @@

import torch
from peft import PeftModel
from transformers import is_torch_xpu_available

import modules.shared as shared
from modules.logging_colors import logger
@@ -179,6 +180,9 @@ def add_lora_transformers(lora_names):
if torch.backends.mps.is_available():
device = torch.device('mps')
shared.model = shared.model.to(device)
elif is_torch_xpu_available():
device = torch.device("xpu:0")
shared.model = shared.model.to(device)
else:
shared.model = shared.model.cuda()

3 changes: 2 additions & 1 deletion modules/RWKV.py
@@ -9,6 +9,7 @@

import numpy as np
from tokenizers import Tokenizer
from transformers import is_torch_xpu_available

import modules.shared as shared
from modules.callbacks import Iteratorize
@@ -27,7 +28,7 @@ def __init__(self):
pass

@classmethod
def from_pretrained(self, path, dtype="fp16", device="cuda"):
def from_pretrained(self, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):
tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
if shared.args.rwkv_strategy is None:
model = RWKV(model=str(path), strategy=f'{device} {dtype}')
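
The new defaults above pick bf16 and the xpu device when an Intel GPU is present, otherwise keeping fp16 on cuda. A small sketch of how those defaults feed the rwkv strategy string built by the loader (default_rwkv_strategy is an illustrative helper; the real code formats f'{device} {dtype}'):

from transformers import is_torch_xpu_available


def default_rwkv_strategy(cpu=False):
    # Mirrors the dtype/device defaults above, including the fp32 CPU case from models.py.
    if cpu:
        return 'cpu fp32'
    if is_torch_xpu_available():
        return 'xpu bf16'
    return 'cuda fp16'


print(default_rwkv_strategy())  # e.g. 'cuda fp16' on an NVIDIA machine
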
6 changes: 5 additions & 1 deletion modules/callbacks.py
@@ -5,6 +5,7 @@

import torch
import transformers
from transformers import is_torch_xpu_available

import modules.shared as shared

@@ -92,4 +93,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
torch.cuda.empty_cache()
if is_torch_xpu_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()
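
clear_torch_cache now branches on the backend; the same pattern appears again in modules/models.py below. A standalone sketch of the idea, with an extra CUDA-availability guard added for safety on CPU-only machines:

import gc

import torch
from transformers import is_torch_xpu_available


def clear_torch_cache(cpu=False):
    gc.collect()
    if cpu:
        return
    if is_torch_xpu_available():
        torch.xpu.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
    # on a pure-CPU box there is no device cache to drop
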
11 changes: 9 additions & 2 deletions modules/logits.py
@@ -1,4 +1,5 @@
import torch
from transformers import is_torch_xpu_available

from modules import sampler_hijack, shared
from modules.logging_colors import logger
@@ -32,13 +33,19 @@ def get_next_logits(prompt, state, use_samplers, previous):
scores = sampler_hijack.global_scores[-1]
else:
if is_non_hf_exllamav2 or is_non_hf_exllamav1:
tokens = shared.tokenizer.encode(prompt).cuda()
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
scores = shared.model.get_logits(tokens)[-1][-1]
elif is_non_hf_llamacpp:
tokens = shared.tokenizer.encode(prompt)
scores = shared.model.get_logits(tokens)[-1][-1]
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()
output = shared.model(input_ids=tokens)
scores = output['logits'][-1][-1]

50 changes: 33 additions & 17 deletions modules/models.py
@@ -7,7 +7,12 @@

import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate import (
infer_auto_device_map,
init_empty_weights,
is_ccl_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
@@ -38,8 +43,12 @@
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration

@@ -137,8 +146,9 @@ def huggingface_loader(model_name):
if torch.backends.mps.is_available():
device = torch.device('mps')
model = model.to(device)
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
model = model.to('xpu')
elif is_xpu_available():
device = torch.device("xpu")
model = model.to(device)
else:
model = model.cuda()

@@ -151,15 +161,10 @@

# Load with quantization and/or offloading
else:
conditions = [
shared.args.cpu,
torch.cuda.is_available(),
torch.backends.mps.is_available(),
hasattr(torch, 'xpu') and torch.xpu.is_available(),
]

if not any(conditions):
logger.warning('No GPU has been detected by Pytorch. Falling back to CPU mode.')

if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')

shared.args.cpu = True

if shared.args.cpu:
@@ -362,7 +367,12 @@ def RWKV_loader(model_name):
'''
from modules.RWKV import RWKVModel, RWKVTokenizer

model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
model = RWKVModel.from_pretrained(
Path(f'{shared.args.model_dir}/{model_name}'),
dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16",
device="cpu" if shared.args.cpu else "xpu" if is_xpu_available() else "cuda"
)

tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir))
return model, tokenizer

@@ -380,7 +390,10 @@ def get_max_memory_dict():
# If --auto-devices is provided standalone, try to get a reasonable value
# for the maximum memory of device :0
elif shared.args.auto_devices:
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
if is_xpu_available():
total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
else:
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
suggestion = round((total_mem - 1000) / 1000) * 1000
if total_mem - suggestion < 800:
suggestion -= 1000
@@ -395,7 +408,10 @@ def clear_torch_cache():
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
torch.cuda.empty_cache()
if is_xpu_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()


def unload_model():
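
get_max_memory_dict now reads total VRAM from whichever backend is present before applying the existing --auto-devices heuristic. A worked sketch of that heuristic (suggest_gpu_memory is an illustrative name; values are in MiB, as produced by total_memory / (1024 * 1024)):

def suggest_gpu_memory(total_mem_mib):
    # Leave roughly 1 GiB of headroom, round to the nearest 1000 MiB, and back off
    # another 1000 MiB if the rounded value lands within 800 MiB of the card's total.
    suggestion = round((total_mem_mib - 1000) / 1000) * 1000
    if total_mem_mib - suggestion < 800:
        suggestion -= 1000
    return int(suggestion)


print(suggest_gpu_memory(8192))   # 7000
print(suggest_gpu_memory(24576))  # 23000: rounding gives 24000, within 800 of total, so back off
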
11 changes: 7 additions & 4 deletions modules/sampler_hijack.py
@@ -2,7 +2,7 @@

import torch
import transformers
from transformers import LogitsWarper
from transformers import LogitsWarper, is_torch_xpu_available
from transformers.generation.logits_process import (
LogitNormalization,
LogitsProcessor,
@@ -106,9 +106,12 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
break

# Normalize the probabilities of the remaining words
prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda')

prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda')
if is_torch_xpu_available():
prob_topk = torch.softmax(sorted_logits, dim=0).to("xpu")
prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to("xpu")
else:
prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda')
prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda')

observed_surprise = -math.log2(prob_topk[prev_i])
self.e = observed_surprise - self.mirostat_tau
9 changes: 5 additions & 4 deletions modules/text_generation.py
@@ -9,7 +9,7 @@
import numpy as np
import torch
import transformers
from transformers import LogitsProcessorList
from transformers import LogitsProcessorList, is_torch_xpu_available

import modules.shared as shared
from modules.callbacks import (
@@ -132,8 +132,8 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
elif torch.backends.mps.is_available():
device = torch.device('mps')
return input_ids.to(device)
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
return input_ids.to('xpu')
elif is_torch_xpu_available():
return input_ids.to("xpu:0")
else:
return input_ids.cuda()

@@ -238,7 +238,8 @@ def set_manual_seed(seed):
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)

elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
return seed


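
set_manual_seed now seeds whichever accelerator backend exists. A standalone sketch, assuming the usual convention in this codebase that a seed of -1 requests a random seed (that handling sits outside the visible hunk):

import random

import torch
from transformers import is_torch_xpu_available


def set_manual_seed(seed):
    seed = int(seed)
    if seed == -1:                      # assumed convention: -1 means "pick a random seed"
        seed = random.randint(1, 2**31)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    elif is_torch_xpu_available():
        torch.xpu.manual_seed_all(seed)

    return seed


print(set_manual_seed(-1))  # prints the randomly chosen seed
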
2 changes: 2 additions & 0 deletions modules/training.py
@@ -26,6 +26,7 @@
)
from peft.utils.other import \
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as model_to_lora_modules
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
@@ -626,6 +627,7 @@ def on_log(self, args: transformers.TrainingArguments, state: transformers.Train
# TODO: Enable multi-device support
ddp_find_unused_parameters=None,
no_cuda=shared.args.cpu,
use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
),
data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
callbacks=list([Callbacks()])
11 changes: 7 additions & 4 deletions modules/ui.py
@@ -4,10 +4,10 @@
import gradio as gr
import torch
import yaml
from transformers import is_torch_xpu_available

from modules import shared


with open(Path(__file__).resolve().parent / '../css/NotoSans/stylesheet.css', 'r') as f:
css = f.read()
with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f:
@@ -85,9 +85,12 @@ def list_model_elements():
'rope_freq_base',
'numa',
]

for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
elements.append(f'gpu_memory_{i}')
else:
for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')

return elements

9 changes: 7 additions & 2 deletions modules/ui_model_menu.py
@@ -8,6 +8,7 @@
import gradio as gr
import psutil
import torch
from transformers import is_torch_xpu_available

from modules import loaders, shared, ui, utils
from modules.logging_colors import logger
@@ -27,8 +28,12 @@ def create_ui():

# Finding the default values for the GPU and CPU memories
total_mem = []
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
else:
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))

default_gpu_mem = []
if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
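
modules/ui.py and modules/ui_model_menu.py now enumerate XPUs instead of CUDA devices when an Intel GPU is detected. A compact sketch of that enumeration (device_inventory is an illustrative helper yielding (index, total MiB) pairs):

import math

import torch
from transformers import is_torch_xpu_available


def device_inventory():
    # Pick the backend the way ui.py / ui_model_menu.py do, then report per-device VRAM in MiB.
    backend = torch.xpu if is_torch_xpu_available() else torch.cuda
    for i in range(backend.device_count()):
        yield i, math.floor(backend.get_device_properties(i).total_memory / (1024 * 1024))


for index, mib in device_inventory():
    print(f'gpu_memory_{index}: {mib} MiB')
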
13 changes: 13 additions & 0 deletions one_click.py
@@ -56,6 +56,19 @@ def cpu_has_avx2():
return True


def cpu_has_amx():
try:
import cpuinfo

info = cpuinfo.get_cpu_info()
if 'amx' in info['flags']:
return True
else:
return False
except:
return True


def torch_version():
site_packages_path = None
for sitedir in site.getsitepackages():