gptq is soooo lazy #424

Merged
merged 10 commits into from
Jul 25, 2023
Changes from all commits
249 changes: 193 additions & 56 deletions modeling/inference_models/gptq_hf_torch/class.py
@@ -7,7 +7,7 @@
import re
import shutil
import sys
from typing import Union
from typing import Dict, Union

import utils
import modeling.lazy_loader as lazy_loader
@@ -82,13 +82,83 @@ def get_gptq_version(fpath):
logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
return 0, False

def load_quant_offload_device_map(
load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False,
):
from gptq.offload import (
find_layers,
llama_offload_forward,
gptneox_offload_forward,
gptj_offload_forward,
opt_offload_forward,
bigcode_offload_forward
)
from transformers.models.llama.modeling_llama import LlamaModel
from transformers.models.opt.modeling_opt import OPTModel
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel
from transformers.models.gptj.modeling_gptj import GPTJModel
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel
model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias)

m, layers, remaining = find_layers(model)
type(m).non_offload_forward = type(m).forward

# Hook offload_forward into found model
if type(m) == LlamaModel:
type(m).forward = llama_offload_forward
elif type(m) == GPTNeoXModel:
type(m).forward = gptneox_offload_forward
elif type(m) == GPTJModel:
type(m).forward = gptj_offload_forward
elif type(m) == OPTModel:
type(m).forward = opt_offload_forward
elif type(m) == GPTBigCodeModel:
type(m).forward = bigcode_offload_forward
else:
raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader")

layers_done = len([1 for v in device_map.values() if v != "cpu"])

m.cpu_device = torch.device("cpu")
m.fast_offload = layers_done > len(layers) // 2
m.layer_count = len(layers)
m.cpu_layers = len(layers) - layers_done
m.gpu_layers = layers_done
m.offload_type = offload_type
# HACK
m.primary_gpu = list(device_map.values())[0]

if "layers" not in dir(m):
m.layers = layers

for i in range(len(layers)):
dev = None
for key, device in device_map.items():
key = int(*[x for x in key.split(".") if x.isdecimal()])
if key == i:
dev = device
break
if dev is None:
raise ValueError
layers[key].to(dev, torch.float16, False)

for module in remaining:
module.to(m.primary_gpu)

return model
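
load_quant_offload_device_map replaces the old gpu_layers_list-based load_quant_offload call with a per-module device map: each decoder layer's index is parsed out of its dotted key and the layer is moved to the mapped device. A minimal, runnable sketch of that key-parsing loop, assuming keys of the form "model.layers.<i>" (the toy device_map below is illustrative, not taken from the PR):

device_map = {
    "model.layers.0": "cuda:0",
    "model.layers.1": "cuda:0",
    "model.layers.2": "cpu",
}

for i in range(3):
    dev = None
    for key, device in device_map.items():
        # The dotted key contains exactly one numeric component: the layer index.
        layer_idx = int(*[x for x in key.split(".") if x.isdecimal()])
        if layer_idx == i:
            dev = device
            break
    if dev is None:
        raise ValueError(f"layer {i} missing from device_map")
    print(i, "->", dev)  # the real loop does layers[i].to(dev, torch.float16, False)
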


class model_backend(HFTorchInferenceModel):
def is_valid(self, model_name, model_path, menu_path):
gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
return bool(gptq_model)

def _load(self, save_model: bool, initial_load: bool) -> None:
try:
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM

# Make model path the same as the model name to make this consistent
# with the other loading method if it isn't a known model type. This
# code is not just a workaround for below, it is also used to make the
@@ -98,7 +168,7 @@ def _load(self, save_model: bool, initial_load: bool) -> None:

self.init_model_config()

self.lazy_load = False
self.lazy_load = True

gpulayers = self.breakmodel_config.gpu_blocks

@@ -107,10 +177,6 @@ def _load(self, save_model: bool, initial_load: bool) -> None:
except (ValueError, AttributeError):
self.gpu_layers_list = [utils.num_layers(self.model_config)]

tf_kwargs = {
"low_cpu_mem_usage": True,
}

# If we're using torch_lazy_loader, we need to get breakmodel config
# early so that it knows where to load the individual model tensors
logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
@@ -123,9 +189,6 @@ def _load(self, save_model: bool, initial_load: bool) -> None:
self.breakmodel_device_config(self.model_config)

if self.lazy_load:
# torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
tf_kwargs.pop("low_cpu_mem_usage", None)

# If we're using lazy loader, we need to figure out what the model's hidden layers are called
with lazy_loader.use_lazy_load(dematerialized_modules=True):
try:
@@ -141,7 +204,7 @@ def _load(self, save_model: bool, initial_load: bool) -> None:

if self.get_local_model_path():
# Model is stored locally, load it.
self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
self.model = self._get_model(self.get_local_model_path())
self.tokenizer = self._get_tokenizer(self.get_local_model_path())
else:
raise NotImplementedError("GPTQ Model downloading not implemented")
@@ -161,15 +224,71 @@ def _load(self, save_model: bool, initial_load: bool) -> None:
self.model.kai_model = self
utils.koboldai_vars.modeldim = self.get_hidden_size()

def _get_model(self, location: str, tf_kwargs: Dict):
def _patch_quant(self, device_map, quant_module) -> None:
def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs):
if isinstance(module, quant_module.QuantLinear):
return

for attr in dir(module):
tmp = getattr(module, attr)
name1 = name + '.' + attr if name != '' else attr
if name1 in names:
parts = name1.split(".")
device = None
for i in reversed(range(len(parts))):
maybe_key = ".".join(parts[:i])
if maybe_key in device_map:
device = device_map[maybe_key]
break

if device is None:
raise ValueError(f"No device for {name1}")

delattr(module, attr)

ql = quant_module.QuantLinear(
bits,
groupsize,
tmp.in_features,
tmp.out_features,
force_bias or tmp.bias is not None,
**kwargs,
)
ql = ql.to(device)

setattr(module, attr, ql)

for name1, child in module.named_children():
make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1, force_bias=force_bias)

quant_module.make_quant = make_quant


def _patch_quants(self, device_map) -> None:
# Load QuantLinears on the device corresponding to the device map

from gptq import quant_v3
from gptq import quant_v2
from gptq import quant_v1

for quant_module in [quant_v3, quant_v2, quant_v1]:
self._patch_quant(device_map, quant_module)
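
The patched make_quant builds each QuantLinear directly on the device mapped to its nearest enclosing module, found by checking dotted-name prefixes from longest to shortest. That lookup in isolation, as a runnable sketch (find_device and the sample names are illustrative, not part of the PR):

def find_device(name, device_map):
    # Walk proper prefixes of the dotted name from longest to shortest,
    # mirroring the reversed(range(...)) loop in _patch_quant above.
    parts = name.split(".")
    for i in reversed(range(len(parts))):
        prefix = ".".join(parts[:i])
        if prefix in device_map:
            return device_map[prefix]
    raise ValueError(f"No device for {name}")

device_map = {"model.layers.0": "cuda:0", "lm_head": "cpu"}
print(find_device("model.layers.0.self_attn.q_proj", device_map))  # cuda:0
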


def _get_model(self, location: str):
import gptq
from gptq.gptj import load_quant as gptj_load_quant
from gptq.gptneox import load_quant as gptneox_load_quant
from gptq.llama import load_quant as llama_load_quant
from gptq.opt import load_quant as opt_load_quant
from gptq.bigcode import load_quant as bigcode_load_quant
from gptq.mpt import load_quant as mpt_load_quant
from gptq.offload import load_quant_offload

try:
import hf_bleeding_edge
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM

gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
v2_bias = False
@@ -181,50 +300,68 @@ def _get_model(self, location: str, tf_kwargs: Dict):
model_type = self.get_model_type()

logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")
if model_type == "gptj":
model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "gpt_neox":
model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "llama":
model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "opt":
model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "mpt":
model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "gpt_bigcode":
model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
else:
try:
import auto_gptq
from auto_gptq import AutoGPTQForCausalLM
except ImportError:
raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

try:
import hf_bleeding_edge
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM

# Monkey patch in hf_bleeding_edge to avoid having to trust remote code
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))

# Patch in embeddings function
def get_input_embeddings(self):
return self.model.get_input_embeddings()

type(model).get_input_embeddings = get_input_embeddings

# Patch in args support..
def generate(self, *args, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(*args, **kwargs)

type(model).generate = generate

device_map = {}

if self.lazy_load:
with lazy_loader.use_lazy_load(dematerialized_modules=True):
metamodel = AutoModelForCausalLM.from_config(self.model_config)
if utils.args.cpu:
device_map = {name: "cpu" for name in utils.layers_module_names}
for name in utils.get_missing_module_names(
metamodel, list(device_map.keys())
):
device_map[name] = "cpu"
else:
device_map = self.breakmodel_config.get_device_map(
metamodel
)

self._patch_quants(device_map)

with lazy_loader.use_lazy_load(
enable=self.lazy_load,
dematerialized_modules=False,
):
if model_type == "gptj":
model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "gpt_neox":
model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "llama":
model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "opt":
model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "mpt":
model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "gpt_bigcode":
model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half()
else:
try:
import auto_gptq
from auto_gptq import AutoGPTQForCausalLM
except ImportError:
raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

# Monkey patch in hf_bleeding_edge to avoid having to trust remote code
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM

model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))

# Patch in embeddings function
def get_input_embeddings(self):
return self.model.get_input_embeddings()

type(model).get_input_embeddings = get_input_embeddings

# Patch in args support..
def generate(self, *args, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(*args, **kwargs)

type(model).generate = generate

return model

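The lazy-load path above hinges on having a complete device_map before any weights are read: a dematerialized metamodel is built from the config just to enumerate module names, then either --cpu pins everything to "cpu" or breakmodel_config.get_device_map assigns the GPU blocks. As a point of comparison only, the stock accelerate API expresses roughly the same idea (the model id and memory budget below are placeholders, not from the PR):

from accelerate import init_empty_weights, infer_auto_device_map
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("facebook/opt-1.3b")   # placeholder model id
with init_empty_weights():                                  # meta tensors only, no RAM for weights
    metamodel = AutoModelForCausalLM.from_config(config)

# Assign every parameter-owning module to a GPU or to "cpu" under a memory budget.
device_map = infer_auto_device_map(metamodel, max_memory={0: "10GiB", "cpu": "32GiB"})
print(device_map)
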
7 changes: 2 additions & 5 deletions modeling/lazy_loader.py
@@ -176,9 +176,6 @@ def materialize(
CheckpointChunkCache.key = self.key
ziproot = checkpoint.namelist()[0].split("/")[0]
CheckpointChunkCache.handle = checkpoint.open(f"{ziproot}/data/{self.key}", "r")



else:
# Cache hit. Hip hip hooray! :^)
# print(".", end="", flush=True)
@@ -318,7 +315,6 @@ class _LazyUnpickler(RestrictedUnpickler):
lazy_loaded_storages: Dict[str, LazyTensor]

def __init__(self, *args, **kwargs):
# print(args, kwargs)
self.lazy_loaded_storages = {}
return super().__init__(*args, **kwargs)

@@ -376,7 +372,7 @@ def safetensors_load(checkpoint_file: str) -> dict:
# (70 tensors/s -> 65 tensor/s). The memory savings probably
# shouldn't be the happening, maybe there's a memory leak
# somewhere in our pipeline with CPU tensors.
intermediary_device = "cuda"
intermediary_device = "cuda:0"
else:
intermediary_device = "cpu"

@@ -409,6 +405,7 @@ def safetensors_load(checkpoint_file: str) -> dict:
return tensors

transformers.modeling_utils.safe_load_file = safetensors_load
safetensors.torch.load_file = safetensors_load


@contextlib.contextmanager
18 changes: 18 additions & 0 deletions modeling/patches.py
@@ -129,15 +129,33 @@ def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int):


class LazyloadPatches:
class StateDictFacade(dict):
def __init__(self, state_dict):
self.update(state_dict)

def __getitem__(self, name):
return super().__getitem__(name).materialize(map_location="cuda:0")

old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model
torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict

def __enter__() -> None:
transformers.modeling_utils._load_state_dict_into_meta_model = (
LazyloadPatches._load_state_dict_into_meta_model
)
torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict

def __exit__(exc_type, exc_value, exc_traceback) -> None:
transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict
torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict

def _torch_load_from_state_dict(self, state_dict, *args, **kwargs):
return LazyloadPatches.torch_old_load_from_state_dict(
self,
LazyloadPatches.StateDictFacade(state_dict),
*args,
**kwargs
)

def _load_state_dict_into_meta_model(
model,
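
The patches.py hunk is what lets ordinary torch modules consume lazy tensors: StateDictFacade subclasses dict and materializes a value onto the GPU only when torch.nn.Module._load_from_state_dict looks it up by name. The access-triggered behaviour can be sketched without torch at all (FakeLazyTensor stands in for the project's LazyTensor and is not part of the PR):

class FakeLazyTensor:
    """Stand-in for modeling.lazy_loader.LazyTensor; holds a value until asked."""
    def __init__(self, value):
        self.value = value

    def materialize(self, map_location=None):
        print(f"materializing on {map_location}")
        return self.value


class StateDictFacade(dict):
    def __init__(self, state_dict):
        self.update(state_dict)

    def __getitem__(self, name):
        # Materialization happens only on item access, so iteration over keys
        # stays cheap and only the entries torch actually loads get realized.
        return super().__getitem__(name).materialize(map_location="cuda:0")


sd = StateDictFacade({"weight": FakeLazyTensor([1.0, 2.0])})
print(list(sd))      # ['weight'] -- no materialization yet
print(sd["weight"])  # triggers materialize(), then prints the value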