From d2aa176a082d61f859866281e0cd1db72a61e158 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 7 May 2024 01:49:27 -0700 Subject: [PATCH 01/28] adapt smoothquant,static,woq autooround Signed-off-by: changwangss --- .../transformers/llm/quantization/utils.py | 194 ++++++++---------- .../transformers/modeling/modeling_auto.py | 73 +++---- 2 files changed, 126 insertions(+), 141 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index bbf38d7fdd7..2513ce4e5c6 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -26,7 +26,20 @@ from neural_compressor import quantization from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from neural_compressor.utils.utility import LazyImport -from neural_compressor.config import PostTrainingQuantConfig +from neural_compressor.torch.algorithms.weight_only.autoround import get_autoround_default_run_fn +from neural_compressor.torch.quantization import ( + AutoRoundConfig, + RTNConfig, + GPTQConfig, + AWQConfig, + TEQConfig, + StaticQuantConfig, + SmoothQuantConfig, + HQQConfig, + convert, + get_default_AutoRound_config, + prepare +) from intel_extension_for_transformers.tools.utils import ( is_ipex_available, is_autoround_available, @@ -334,8 +347,6 @@ def convert_to_quantized_model(model, config, device="cpu"): assert ( hasattr(torch, "xpu") and torch.xpu.is_available() ), "There is no xpu device in this system!" - calib_dataloader = config.calib_dataloader - calib_func = config.calib_func calib_iters = config.calib_iters calib_dataset = config.dataset model_device = next(model.parameters()).device @@ -406,20 +417,13 @@ def collate_batch_for_autoround(batch): return torch.vstack(input_ids_padded) - if config.quant_method.value == "autoround": - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=8, - shuffle=False, - collate_fn=collate_batch_for_autoround, - ) - else: - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) + + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) if calib_func is None and config.quant_method.value == "awq": def default_calib_func(model): @@ -438,6 +442,12 @@ def default_calib_func(model): + "the calibration dataset is NeelNanda/pile-10k," + "batchsize is 1 and calibration iteration is 100." 
) + orig_dtype = torch.float32 + for param in model.parameters(): + orig_dtype = param.dtype + if orig_dtype != torch.float32: + model.to(dtype=torch.float32) + break if config.weight_dtype in ["fp8_e4m3", "fp8_e5m2"]: return replace_linear(model, None, None, config, device=device) else: @@ -450,102 +460,74 @@ def default_calib_func(model): dtype = config.weight_dtype # mapping to INC config if config.quant_method.value == "rtn": - recipes = { - "layer_wise_quant": config.layer_wise, - "rtn_args": { - "enable_full_range": ( - True if "fullrange" in config.weight_dtype else False - ), - "enable_mse_search": config.mse_range, - }, - } - algorithm = "RTN" + quant_config = RTNConfig( + + ) + elif config.quant_method.value == "hqq": + quant_config = HQQConfig( + + ) elif config.quant_method.value == "awq": - recipes = { - "rtn_args": { - "enable_full_range": ( - True if "fullrange" in config.weight_dtype else False - ), - "enable_mse_search": config.mse_range, - }, - "awq_args": {"folding": True}, - } - algorithm = "AWQ" + quant_config = AWQConfig( + + ) elif config.quant_method.value == "teq": - recipes = {"teq_args": {}} - algorithm = "TEQ" + quant_config = TEQConfig( + + ) elif config.quant_method.value == "gptq": - recipes = { - "layer_wise_quant": config.layer_wise, - "gptq_args": { - "act_order": config.desc_act, - "percdamp": config.damp_percent, - "block_size": config.blocksize, - "nsamples": config.nsamples, - "use_max_length": True if config.max_input_length else False, - "pad_max_length": config.max_input_length, - "static_groups": config.static_groups, - }, - } - algorithm = "GPTQ" + quant_config = GPTQConfig( + + ) elif config.quant_method.value == "autoround": - recipes = { - "autoround_args": { - "n_samples": config.nsamples, - "seq_len": config.calib_len, - "iters": config.calib_iters, - "scale_dtype": config.scale_dtype, - "use_quant_input": config.use_quant_input, - "lr": config.lr, - "minmax_lr": config.minmax_lr, - } - } - algorithm = "AUTOROUND" + quant_config = AutoRoundConfig( + dtype=config.dtype, + bits=config.bits, + use_sym=config.use_sym, + group_size=config.group_size, + enable_full_range=config.enable_full_range, + batch_size=config.batch_size, + lr_scheduler=config.lr_scheduler, + use_quant_input=config.use_quant_input, + enable_minmax_tuning=config.enable_minmax_tuning, + lr=config.lr, + minmax_lr=config.minmax_lr, + low_gpu_mem_usage=config.low_gpu_mem_usage, + iters=config.iters, + seqlen=config.seq_len, + n_samples=config.n_samples, + sampler=config.sampler, + seed=config.seed, + n_blocks=config.n_blocks, + gradient_accumulate_steps=config.gradient_accumulate_steps, + not_use_best_mse=config.not_use_best_mse, + dynamic_max_gap=config.dynamic_max_gap, + scale_dtype=config.scale_dtype, + white_list=config.white_list, + ) + quant_config.set_local(".*lm_head", AutoRoundConfig(w_dtype="fp32", act_dtype="fp32")) + quant_config.set_local(".*output_layer", AutoRoundConfig(w_dtype="fp32", act_dtype="fp32")) + quant_config.set_local(".*embed_out", AutoRoundConfig(w_dtype="fp32", act_dtype="fp32")) + logger.info(f"Do AutoRound with config {quant_config}") + + run_fn = get_autoround_default_run_fn + run_args = ( + tokenizer, + dataset, + quant_config.n_samples, + quant_config.seq_len, + quant_config.seed, + quant_config.batch_size, + "train" + ) + inc_model = prepare(model=model, quant_config=quant_config) + run_fn(model, *run_args) + inc_model = convert(inc_model) + inc_model.eval() + else: assert False, "The Supported algorithm are RTN, AWQ, TEQ, GPTQ, AUTOROUND" - conf 
= PostTrainingQuantConfig( - approach="weight_only", - op_type_dict={ - ".*": { - "weight": { - "bits": bits, - "dtype": dtype, - "group_size": config.group_size, # -1 (per-channel) - "scheme": config.scheme, - "algorithm": algorithm, - }, - }, - }, - op_name_dict={ - ".*lm_head": { # re.match - "weight": {"dtype": "fp32"}, - }, - ".*output_layer": { # re.match - "weight": {"dtype": "fp32"}, - }, - ".*embed_out": { # re.match - "weight": {"dtype": "fp32"}, - }, - }, - recipes=recipes, - ) - # TEQ: set calib_func=None, use default training func as calib_func - # RTN: doesn't need calib_func - if config.quant_method.value not in ["awq"]: - calib_func = None - - orig_dtype = torch.float32 - for param in model.parameters(): - orig_dtype = param.dtype - if orig_dtype != torch.float32: - model.to(dtype=torch.float32) - break - inc_model = quantization.fit( - model, conf, calib_func=calib_func, calib_dataloader=calib_dataloader - ) - inc_model.eval() - if device == "xpu" or device == torch.device("xpu"): model = inc_model.export_compressed_model( compression_dtype=torch.int8, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index ede4c684427..3f5332d0fdb 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -951,28 +951,31 @@ def calib_func(model): } break - # call inc sq - from neural_compressor import PostTrainingQuantConfig, quantization - - conf = PostTrainingQuantConfig( - backend=quantization_config.backend, # default is ipex - excluded_precisions=quantization_config.excluded_precisions, - op_type_dict=quantization_config.op_type_dict, - op_name_dict=quantization_config.op_name_dict, - recipes=quantization_config.recipes, - example_inputs=example_inputs, - ) - model = quantization.fit( - model, - conf, - calib_func=calib_func, - calib_dataloader=( - calib_dataloader - if quantization_config.recipes["smooth_quant_args"]["alpha"] - == "auto" - else None - ), + # call inc smoothquant + from neural_compressor.torch.quantization import SmoothQuantConfig, quantize + quant_config = SmoothQuantConfig( + w_dtype=quantization_config.w_dtype, + w_sym=quantization_config.w_sym, + w_granularity=quantization_config.w_granularity, + w_algo=quantization_config.w_algo, + act_dtype=quantization_config.act_dtype, + alpha=quantization_config.alpha, + folding=quantization_config.folding, + scale_sharing=quantization_config.scale_sharing, + init_alpha=quantization_config.init_alpha, + alpha_min=quantization_config.alpha_min, + alpha_step=quantizate_config.alpha_step, + shared_criterion=quantization_config.shared_criterion, + do_blockwise=quantizate_config.do_blockwise, + auto_alpha_args=quantizate_config.auto_alpha_args, + white_list=quantizate_config.white_list, ) + + model = quantize(model, + quant_config=quant_config, + run_fn=run_fn, + example_inputs=example_inputs + ) logger.info("SmoothQuant done.") elif isinstance(quantization_config, DynamicQuantConfig): model = cls.ORIG_MODEL.from_pretrained( @@ -1142,20 +1145,21 @@ def calib_func(model): # call inc static quant - from neural_compressor import PostTrainingQuantConfig, quantization - - conf = PostTrainingQuantConfig( - backend=quantization_config.backend, - excluded_precisions=quantization_config.excluded_precisions, - op_type_dict=quantization_config.op_type_dict, - op_name_dict=quantization_config.op_name_dict, - 
example_inputs=quantization_config.example_inputs, - ) - model = quantization.fit( - model, - conf, - calib_func=calib_func, + from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare + quant_config = StaticQuantConfig( + w_dtype=quantization_config.w_dtype, + w_sym=quantization_config.w_sym, + w_granularity=quantization_config.w_granularity, + w_algo=quantization_config.w_algo, + act_dtype=quantization_config.act_dtype, + act_sym=quantization_config.act_sym, + act_granularity=quantization_config.act_granularity, + act_algo=quantization_config.act_algo, + white_list=quantizate_config.white_list, ) + prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs) + calib_func(prepared_model) + q_model = convert(prepared_model) model.save_pretrained = types.MethodType(save_low_bit, model) quantization_config.remove_redundant_parameters() model.quantization_config = quantization_config @@ -1292,7 +1296,6 @@ def train_func(model): ) train_func = train_func - # call inc static quant from neural_compressor import QuantizationAwareTrainingConfig, quantization from neural_compressor.training import prepare_compression From e804b7bcfc4d2dd2dd5d675c41d820a403c999af Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 16 May 2024 03:49:40 -0700 Subject: [PATCH 02/28] add GPTQ API Signed-off-by: changwangss --- .../llm/quantization/nn/modules.py | 1 + .../transformers/llm/quantization/utils.py | 263 +++++++++--------- .../transformers/utils/config.py | 7 +- 3 files changed, 130 insertions(+), 141 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py index 379a227e812..38586062804 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py @@ -297,6 +297,7 @@ def recover_idx(ret_idx, k, blocksize): for i in range(value_range): for j in range(blocksize): g_idx[ret_idx[i * blocksize + j]] = i + print(ret_idx[i * blocksize + j]) return g_idx def recover_int_weight(g_idx, int_weight): diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index aed5e10e7ef..cf2733167c1 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -24,7 +24,8 @@ from accelerate import init_empty_weights from datasets import load_dataset from neural_compressor import quantization -from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear +#from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.utils.utility import LazyImport from neural_compressor.torch.algorithms.weight_only.autoround import get_autoround_default_run_fn from neural_compressor.torch.quantization import ( @@ -110,7 +111,7 @@ def unpack_weight(qweight, scales, qzeros, q_config): # change it to int8 with offset 128 if not sym: weight = (weight.to(torch.int32) - 128).to(torch.int8) - return weight, scales, zeros + return weight.contiguous(), scales.contiguous(), zeros.contiguous() def replace_linear( @@ -345,7 +346,6 @@ def _replace_linear( quantization_config, ) int_weight = int_weight.view(-1, 
int_weight.shape[-1]) - model._modules[name].set_weights_bias( int_weight, scales, @@ -387,6 +387,82 @@ def _replace_linear( return model, is_replaced +def default_run_fn(model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="GPTQ"): + from datasets import load_dataset + from torch.utils.data import DataLoader + + if isinstance(dataset, (str, bytes, os.PathLike)): + calib_dataset = load_dataset(dataset, split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + if tokenizer is None: + logger.error( + "Please provide the tokenizer in quantization_config." + ) + exit(0) + + def tokenize_function(examples): + if algo == "teq": + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if "prompt" in examples: + if algo == "teq": + example = tokenizer(examples["prompt"], padding="max_length", max_length=max_length) + else: + example = tokenizer(examples["prompt"]) + elif "code" in examples: + if algo == "teq": + example = tokenizer(examples["code"], padding="max_length", max_length=max_length) + else: + example = tokenizer(examples["code"]) + elif "text" in examples: + if algo == "teq": + example = tokenizer(examples["text"], padding="max_length", max_length=max_length) + else: + example = tokenizer(examples["text"]) + else: + logger.error( + "Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset." + ) + exit(0) + return example + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + if len(input_ids) >= max_length: + input_ids = input_ids[:max_length] + else: + continue + input_ids_padded.append(input_ids) + + return torch.vstack(input_ids_padded) + + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=collate_batch, + ) + total_cnt = 0 + for i, (input_ids) in enumerate(calib_dataloader): + if total_cnt + input_ids.shape[0] > n_samples: + input_ids = input_ids[: n_samples - total_cnt, ...] + total_cnt += input_ids.shape[0] + if total_cnt >= n_samples: + break + + try: + model( + input_ids=input_ids, + ) + except ValueError: + pass + def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): import intel_extension_for_pytorch @@ -394,101 +470,8 @@ def convert_to_quantized_model(model, config, device="cpu"): assert ( hasattr(torch, "xpu") and torch.xpu.is_available() ), "There is no xpu device in this system!" - calib_iters = config.calib_iters - calib_dataset = config.dataset - model_device = next(model.parameters()).device - - if ( - calib_dataloader is None - and config.quant_method.value not in ["rtn"] - and calib_dataset is not None - ): - from datasets import load_dataset - from torch.utils.data import DataLoader - - if isinstance(calib_dataset, (str, bytes, os.PathLike)): - calib_dataset = load_dataset(calib_dataset, split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - if config.tokenizer is None: - logger.error( - "Please provide the tokenizer or provide calib_func directly," - + " the following is how to get tokenizer. 
\n" - + " from transformer import AutoTokenizer \n" - + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - def tokenize_function(examples): - if "prompt" in examples: - example = config.tokenizer(examples["prompt"]) - elif "code" in examples: - example = config.tokenizer(examples["code"]) - elif "text" in examples: - example = config.tokenizer(examples["text"]) - else: - logger.error( - "Please check dataset prompt identifier," - + " NeelNanda/pile-10k is default used calibration dataset." - ) - exit(0) - return example - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - - def collate_batch(batch): - input_ids_padded = [] - for text in batch: - input_ids = text["input_ids"] - input_ids = ( - input_ids[:512] - if (len(input_ids) > 512 and config.quant_method.value != "gptq") - else input_ids - ) - input_ids_padded.append(input_ids) - return torch.vstack(input_ids_padded) - - def collate_batch_for_autoround(batch): - input_ids_padded = [] - for text in batch: - input_ids = text["input_ids"] - if input_ids.shape[0] < config.calib_len: - continue - input_ids = input_ids[: config.calib_len] - input_ids_list = input_ids.tolist() - if input_ids_list.count(input_ids_list[-1]) > config.calib_len // 2: - continue - input_ids_padded.append(input_ids) - if len(input_ids_padded) == 0: - return None - - return torch.vstack(input_ids_padded) - - - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - if calib_func is None and config.quant_method.value == "awq": - - def default_calib_func(model): - """This is the default calibration function, the dataset is NeelNanda/pile-10k, - the default calib_iters is 100.""" - for i, (input_ids) in enumerate(calib_dataloader): - if i >= calib_iters: - break - model( - input_ids=input_ids, - ) - - calib_func = default_calib_func - logger.info( - "The default calibration function is used, " - + "the calibration dataset is NeelNanda/pile-10k," - + "batchsize is 1 and calibration iteration is 100." 
- ) + model_device = next(model.parameters()).device orig_dtype = torch.float32 for param in model.parameters(): orig_dtype = param.dtype @@ -520,63 +503,74 @@ def default_calib_func(model): ) elif config.quant_method.value == "teq": quant_config = TEQConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, ) elif config.quant_method.value == "gptq": quant_config = GPTQConfig( - + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + act_order=config.desc_act, + percdamp=config.damp_percent, + block_size=config.blocksize, + static_groups=config.static_groups, + ) + quant_config.set_local(".*lm_head", GPTQConfig(dtype="fp32")) + quant_config.set_local(".*output_layer", GPTQConfig(dtype="fp32")) + quant_config.set_local(".*embed_out", GPTQConfig(dtype="fp32")) + logger.info(f"Do GPTQ with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.max_input_length, # max_length + config.nsamples, # n_samples + config.batch_size, # batch_size + config.quant_method.value # algo ) + model = prepare(model=model, quant_config=quant_config) + run_fn(model, *run_args) + model = convert(model) elif config.quant_method.value == "autoround": quant_config = AutoRoundConfig( - dtype=config.dtype, + dtype=dtype, bits=config.bits, - use_sym=config.use_sym, + use_sym=config.sym, group_size=config.group_size, - enable_full_range=config.enable_full_range, - batch_size=config.batch_size, - lr_scheduler=config.lr_scheduler, use_quant_input=config.use_quant_input, - enable_minmax_tuning=config.enable_minmax_tuning, lr=config.lr, minmax_lr=config.minmax_lr, - low_gpu_mem_usage=config.low_gpu_mem_usage, - iters=config.iters, - seqlen=config.seq_len, - n_samples=config.n_samples, - sampler=config.sampler, - seed=config.seed, - n_blocks=config.n_blocks, - gradient_accumulate_steps=config.gradient_accumulate_steps, - not_use_best_mse=config.not_use_best_mse, - dynamic_max_gap=config.dynamic_max_gap, + seqlen=config.max_input_length, + n_samples=config.nsamples, scale_dtype=config.scale_dtype, - white_list=config.white_list, ) - quant_config.set_local(".*lm_head", AutoRoundConfig(w_dtype="fp32", act_dtype="fp32")) - quant_config.set_local(".*output_layer", AutoRoundConfig(w_dtype="fp32", act_dtype="fp32")) - quant_config.set_local(".*embed_out", AutoRoundConfig(w_dtype="fp32", act_dtype="fp32")) + quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32")) + quant_config.set_local(".*output_layer", AutoRoundConfig(dtype="fp32")) + quant_config.set_local(".*embed_out", AutoRoundConfig(dtype="fp32")) logger.info(f"Do AutoRound with config {quant_config}") - run_fn = get_autoround_default_run_fn run_args = ( - tokenizer, - dataset, + config.tokenizer, + config.dataset, quant_config.n_samples, - quant_config.seq_len, + quant_config.seqlen, quant_config.seed, quant_config.batch_size, "train" ) - inc_model = prepare(model=model, quant_config=quant_config) + model = prepare(model=model, quant_config=quant_config) run_fn(model, *run_args) - inc_model = convert(inc_model) - inc_model.eval() - + model = convert(model) else: assert False, "The Supported algorithm are RTN, AWQ, TEQ, GPTQ, AUTOROUND" if device == "xpu" or device == torch.device("xpu"): - model = inc_model.export_compressed_model( + model = model.export_compressed_model( compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False, @@ -586,18 +580,9 @@ def default_calib_func(model): q_model = replace_linear(model, 
None, None, config, device=device) else: - if config.weight_dtype not in ["nf4", "fp4", "int4_fullrange"]: - inc_model = inc_model.export_compressed_model(use_optimum_format=True) - inc_model.eval() - if config.use_ipex: - optimum_format_state_dict = inc_model.state_dict() - q_model = replace_linear(inc_model, None, None, config, device=device) - if config.use_ipex: - setattr(q_model, "optimum_format_state_dict", optimum_format_state_dict) - else: - q_model = replace_linear( - inc_model.model, None, None, config, device=device - ) + model.eval() + + q_model = replace_linear(model, None, None, config, device=device) if orig_dtype != torch.float32: q_model.to(dtype=orig_dtype) diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index c7d0ae80ed4..9de8d3c5335 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -804,6 +804,7 @@ def __init__( bits: int = 4, tokenizer: Any = None, dataset: str = "NeelNanda/pile-10k", + batch_size: int = 8, group_size: int = 32, compute_dtype: Any = None, weight_dtype: Any = None, @@ -833,6 +834,7 @@ def __init__( self.bits = bits self.tokenizer = tokenizer self.dataset = dataset + self.batch_size = batch_size self.compute_dtype = compute_dtype self.weight_dtype = weight_dtype self.scale_dtype = scale_dtype @@ -1052,6 +1054,7 @@ def __init__( use_double_quant=False, double_quant_scale_dtype=None, # reserve for double quant sym: bool = True, + max_input_length: int = 2048, lr: float = 0.0025, minmax_lr: float = 0.0025, use_quant_input: bool = True, @@ -1082,7 +1085,6 @@ def __init__( self.lr = lr self.minmax_lr = minmax_lr self.use_quant_input = use_quant_input - self.iters = iters self.llm_int8_skip_modules = ( llm_int8_skip_modules if llm_int8_skip_modules else [] ) @@ -1090,7 +1092,8 @@ def __init__( self.use_neural_speed = use_neural_speed self.device = kwargs.get("device", "auto") self.calib_dataloader = kwargs.get("calib_dataloader", None) - self.calib_len = kwargs.get("calib_len", None) + self.batch_size = kwargs.get("batch_size", 1) + self.max_input_length = max_input_length self.calib_func = kwargs.get("calib_func", None) self.calib_iters = kwargs.get("calib_iters", 100) self.scheme = "sym" if self.sym else "asym" From cbfd1beeac9f92bb73aae134383454935c70d6da Mon Sep 17 00:00:00 2001 From: "Ye, Xinyu" Date: Fri, 17 May 2024 02:28:42 -0400 Subject: [PATCH 03/28] migrated RTN to use INC3.x API. 
Signed-off-by: Ye, Xinyu --- .../transformers/llm/quantization/utils.py | 28 ++++++++++++++++--- .../transformers/utils/config.py | 16 +++++++++-- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index cf2733167c1..2bc0425e65b 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -462,7 +462,7 @@ def collate_batch(batch): ) except ValueError: pass - + def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): import intel_extension_for_pytorch @@ -490,9 +490,29 @@ def convert_to_quantized_model(model, config, device="cpu"): dtype = config.weight_dtype # mapping to INC config if config.quant_method.value == "rtn": + export_compressed_model = False + if (device == "cpu" or device == torch.device("cpu")) \ + and config.weight_dtype not in ["nf4", "fp4", "int4_fullrange"]: + export_compressed_model = True quant_config = RTNConfig( - + dtype=config.weight_dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + group_dim=config.group_dim, + use_full_range=config.use_full_range, + use_mse_search=config.mse_range, + export_compressed_model=export_compressed_model, + use_layer_wise=config.layer_wise, + model_path=config.model_path, + use_double_quant=config.use_double_quant, + double_quant_dtype=config.double_quant_dtype, + double_quant_bits=config.double_quant_bits, + double_quant_use_sym=config.double_quant_use_sym, + double_quant_group_size=config.double_quant_group_size, ) + model = prepare(model, quant_config) + model = convert(model) elif config.quant_method.value == "hqq": quant_config = HQQConfig( @@ -500,7 +520,7 @@ def convert_to_quantized_model(model, config, device="cpu"): elif config.quant_method.value == "awq": quant_config = AWQConfig( - ) + ) elif config.quant_method.value == "teq": quant_config = TEQConfig( dtype=dtype, @@ -535,7 +555,7 @@ def convert_to_quantized_model(model, config, device="cpu"): ) model = prepare(model=model, quant_config=quant_config) run_fn(model, *run_args) - model = convert(model) + model = convert(model) elif config.quant_method.value == "autoround": quant_config = AutoRoundConfig( dtype=dtype, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 9de8d3c5335..04aa729f207 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -738,14 +738,20 @@ def __init__( self, bits: int = 4, group_size: int = 32, + group_dim: int = 1, compute_dtype: Any = None, weight_dtype: Any = None, scale_dtype: Any = None, + use_full_range: bool = False, mse_range: bool = False, use_double_quant=False, - double_quant_scale_dtype=None, # reserve for double quant + double_quant_dtype: str = "int", + double_quant_bits: int = 8, + double_quant_use_sym: bool = False, + double_quant_group_size: int = 256, sym: bool = True, layer_wise: bool = False, + model_path: str = "", use_ggml: bool = False, use_quant: bool = True, use_neural_speed: bool = False, @@ -754,16 +760,22 @@ def __init__( ): self.quant_method = QuantizationMethod.RTN self.bits = bits + self.use_full_range = use_full_range self.mse_range = mse_range self.compute_dtype = compute_dtype 
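For reference, the RTN path above now goes through the generic INC 3.x prepare/convert flow instead of quantization.fit. A minimal sketch of that flow, assuming model is an already loaded fp32 PyTorch model and using only the names this patch already imports, with placeholder values for the user-facing config fields:

    from neural_compressor.torch.quantization import RTNConfig, prepare, convert

    # Weight-only round-to-nearest: build the config, stage the model, then quantize in place.
    quant_config = RTNConfig(dtype="int4", bits=4, use_sym=True, group_size=32)
    model = prepare(model, quant_config)   # stage the Linear modules for weight-only quantization
    model = convert(model)                 # apply round-to-nearest quantization to the weights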
self.weight_dtype = weight_dtype self.scale_dtype = scale_dtype self.group_size = group_size + self.group_dim = group_dim self.layer_wise = layer_wise + self.model_path = model_path self.sym = sym self.scheme = "sym" if self.sym else "asym" self.use_double_quant = use_double_quant - self.double_quant_scale_dtype = double_quant_scale_dtype + self.double_quant_dtype = double_quant_dtype + self.double_quant_bits = double_quant_bits + self.double_quant_use_sym = double_quant_use_sym + self.double_quant_group_size = double_quant_group_size self.llm_int8_skip_modules = ( llm_int8_skip_modules if llm_int8_skip_modules else [] ) From 44d0ced86d5f9fd4d1bf871185a7cce76016186f Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 20 May 2024 03:22:41 -0700 Subject: [PATCH 04/28] migrate sq with INC 3.x Signed-off-by: changwangss --- .../quantization/run_generation_cpu_woq.py | 19 +- .../quantization/run_generation_sq.py | 92 +-- .../examples/finetuning/multi_modal/train.py | 2 +- .../llm/quantization/nn/modules.py | 6 + .../transformers/llm/quantization/sq_utils.py | 223 +++++++ .../transformers/llm/quantization/utils.py | 120 +++- .../transformers/modeling/modeling_auto.py | 562 +++++------------- .../transformers/utils/config.py | 62 +- tests/CI/test_quantization.py | 308 +++++----- 9 files changed, 708 insertions(+), 686 deletions(-) create mode 100644 intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index a373f36e848..5987684d870 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -276,11 +276,10 @@ compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, - calib_len=args.calib_len, lr=args.lr, minmax_lr=args.minmax_lr, use_quant_input=args.use_quant_input, + max_input_length=args.max_input_length, use_ipex=args.use_ipex, ) else: @@ -316,11 +315,11 @@ print("Didn't do Weight Only Quantization.") # save model -if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): - user_model.save_pretrained(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - # to validate woq model accuracy - args.model = args.output_dir +# if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): +# user_model.save_pretrained(args.output_dir) +# tokenizer.save_pretrained(args.output_dir) +# # to validate woq model accuracy +# args.model = args.output_dir if args.benchmark: print("Loading model from: ", args.model) @@ -392,9 +391,9 @@ if args.use_neural_speed: model_args += ",model_format=neural_speed" args = LMEvalParser(model = "hf", - model_args=model_args, - #user_model=user_model, - #tokenizer=tokenizer, + #model_args=model_args, + user_model=user_model, + tokenizer=tokenizer, tasks = args.tasks, device = "cpu", batch_size = args.batch_size) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index 9296c2b9101..39ec9e46353 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ 
b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -58,33 +58,16 @@ parser.add_argument("--mixed_precision", action="store_true") # ============SmoothQuant configs============== parser.add_argument("--sq", action="store_true") -parser.add_argument("--calib_iters", default=100, type=int, help="Calibration iters.") -parser.add_argument( - "--calib_padding", action="store_true", help="Calibration dataset do padding." -) -parser.add_argument( - "--calib_shuffle", - default=True, - type=str2bool, - help="Calibration dataset do shuffle.", -) -parser.add_argument( - "--calib_pad_val", default=1, type=int, help="Calibration dataset padding value." -) -parser.add_argument( - "--calib_len", - default=512, - type=int, - help="Calibration dataset max or padding max length.", -) -parser.add_argument( - "--recipes", type=str, help="A dictionary as a string, recipes for smoothquant." -) -parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.") -parser.add_argument( - "--fallback_add", action="store_true", help="Whether to fallback add ops to FP32" -) - +parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") +parser.add_argument("--nsamples", default=100, help="Smooth quant calibration samples.") +# sq alpha "auto" parameters +parser.add_argument("--scale_sharing", action="store_true") +parser.add_argument("--init_alpha", default="0.5", help="Smooth quant parameter.") +parser.add_argument("--alpha_min", default="0.0", help="Smooth quant parameter.") +parser.add_argument("--alpha_max", default="1.0", help="Smooth quant parameter.") +parser.add_argument("--alpha_step", default="0.1", help="Smooth quant parameter.") +parser.add_argument("--shared_criterion", default="max", type=str) +parser.add_argument("--do_blockwise", action="store_true") # ============AutoModel parameters============== parser.add_argument("--_commit_hash", default=None, type=str) parser.add_argument("--trust_remote_code", action="store_true") @@ -142,56 +125,19 @@ if args.mixed_precision: quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 elif args.sq: - if re.search("gptj", config.model_type) or re.search("gpt_neox", config.model_type): - op_type_dict = { - "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, - } - elif re.search("mpt", config.model_type): - op_type_dict = { - "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, - "": { - "weight": {"dtype": ["fp32"]}, - "activation": {"dtype": ["fp32"]}, - }, - } - elif re.search("mistral", config.model_type) or re.search( - "baichuan", config.model_type - ): - op_type_dict = {".*": {"activation": {"algorithm": "minmax"}}} - else: - op_type_dict = {} - if args.fallback_add: - op_type_dict["add"] = { - "weight": {"dtype": ["fp32"]}, - "activation": {"dtype": ["fp32"]}, - } excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if args.recipes: - try: - import ast - - recipes = ast.literal_eval(args.recipes) - print("Parsed recipes dictionary:", recipes) - except ValueError as e: - print("Error parsing recipes dictionary:", e) - else: - recipes = { - "smooth_quant": True, - "smooth_quant_args": { - "alpha": args.alpha if args.alpha == "auto" else float(args.alpha) - }, - } quantization_config = SmoothQuantConfig( tokenizer=tokenizer, # either two of one, tokenizer or calib_func - recipes=recipes, - op_type_dict=op_type_dict, # default is {} excluded_precisions=excluded_precisions, # default is [] + alpha = args.alpha, + 
scale_sharing = args.scale_sharing, + init_alpha=args.init_alpha, + alpha_min=args.alpha_min, + alpha_max=args.alpha_max, + alpha_step=args.alpha_step, + shared_criterion=args.shared_criterion, + do_blockwise = args.do_blockwise, num_beams=generate_kwargs["num_beams"], - calib_shuffle=args.calib_shuffle, - calib_iters=args.calib_iters, - calib_padding=args.calib_padding, - calib_len=args.calib_len, - calib_pad_val=args.calib_pad_val, ) else: print("The quantization_config is None.") @@ -210,7 +156,7 @@ tokenizer.save_pretrained(args.output_dir) if args.sq: config.save_pretrained(args.output_dir) - user_model.save(args.output_dir) + torch.jit.save(user_model, args.output_dir + "/pytorch_model.bin") elif args.mixed_precision: user_model.save_pretrained(args.output_dir) args.model = args.output_dir diff --git a/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/train.py b/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/train.py index 75d6d236bf1..dfefe86c6f6 100644 --- a/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/train.py +++ b/intel_extension_for_transformers/neural_chat/examples/finetuning/multi_modal/train.py @@ -145,7 +145,7 @@ def train(): quantization_config = BitsAndBytesConfig( load_in_4bit=training_args.bits == 4, load_in_8bit=training_args.bits == 8, - llm_int8_skip_modules=["mm_projector"], + modules_to_not_convert=["mm_projector"], llm_int8_threshold=6.0, llm_int8_has_fp16_weight=False, bnb_4bit_compute_dtype=compute_dtype, diff --git a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py index 38586062804..40812c90510 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py @@ -194,6 +194,8 @@ def set_weights_bias( if q_config.quant_method.value == "gptq": if q_config.desc_act: + print("before qbits g_idx") + print(g_idx) if not q_config.static_groups: int_weight2 = int_weight.clone() group_size = q_config.group_size @@ -325,7 +327,11 @@ def recover_int_weight(g_idx, int_weight): desc_act = qbits.acquire_packed_weight_info(self.weight, 4)[0] != 0 if desc_act: g_idx = qbits.acquire_packed_weight_info(self.weight, 5) + print("qbits recover g_idx") + print(g_idx) g_idx = recover_idx(g_idx, in_features, group_size) + print("postprocess recover g_idx") + print(g_idx) else: g_idx = None weight_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 6) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py new file mode 100644 index 00000000000..132f0b3c5dc --- /dev/null +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from ...utils import ( + logger, + LazyImport, +) +from datasets import load_dataset +from torch.utils.data import DataLoader +from torch.nn.functional import pad + +torch = LazyImport("torch") +IPEX_OPT_LLM_SUPPORTED_DICT = {"2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], + "2.3": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "bloom", "codegen", "gptbigcode", "t5", "mixtral", "mpt"]} + +MODEL_TYPES_REQUIRING_POSITION_IDS = { + "codegen", + "gpt2", + "gpt-bigcode", + "gpt-neo", + "gpt-neox", + "gptj", + "imagegpt", + "llama", + "mistral", + "chatglm", + "baichuan" +} + +def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": + new_shape = [input_bs, 1, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + elif config.model_type == "baichuan": + new_shape = [input_bs, config.num_attention_heads, 1, normalized_config.hidden_size//config.num_attention_heads] + elif config.model_type == "chatglm": + new_shape = [1, input_bs, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = num_attention_heads + nb_pkv = 2 + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + if config.model_type == "bloom": + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_key_value_heads, d_k, 1] + else: + new_shape = [input_bs * num_key_value_heads, 1, d_k] + + else: + new_shape = [input_bs, num_key_value_heads, 1, d_k] + + beam_idx_tmp = torch.zeros( + (2048, int(input_bs * num_beams)), dtype=torch.long + ).contiguous() + past_key_values = [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + beam_idx_tmp, + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def generate_dummy_past_key_values(config, input_bs): + """Generate the dummy past_key_values.""" + from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": + new_shape = [input_bs, 1, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + elif config.model_type == "baichuan": + new_shape = [input_bs, config.num_attention_heads, 1, normalized_config.hidden_size//config.num_attention_heads] + elif config.model_type == "chatglm": + new_shape = [1, input_bs, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + else: + normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.model_type + )(config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + num_key_value_heads = 
num_attention_heads + if hasattr(normalized_config, "num_key_value_heads"): + num_key_value_heads = normalized_config.num_key_value_heads + if hasattr(normalized_config, "multi_query_group_num"): + num_key_value_heads = normalized_config.multi_query_group_num + + if config.model_type == "bloom": + shape_key = (input_bs * num_attention_heads, d_k, 1) + shape_value = (input_bs * num_attention_heads, 1, d_k) + key = torch.ones(size=shape_key) + value = torch.ones(size=shape_value) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) + for _ in range(num_layers) + ) + return past_key_values + elif config.model_type == "gpt_bigcode": + new_shape = [input_bs, 0, d_k * 2] + dummy_tensor = torch.zeros(size=new_shape) + past_key_values = tuple([dummy_tensor] * num_layers) + return past_key_values + elif config.model_type == "falcon": + new_shape = [input_bs, 1, 0, d_k] + else: + new_shape = [input_bs, num_key_value_heads, 0, d_k] + past_key_values = [ + ( + torch.zeros(size=new_shape).contiguous(), + torch.zeros(size=new_shape).contiguous(), + ) + for _ in range(num_layers) + ] + return tuple(past_key_values) + +def get_dataloader(model_type, quantization_config, past_key_values, shuffle=False, padding=False, max_input_length=512, pad_val=None): + calib_dataset = load_dataset( + quantization_config.dataset, + split=( + "test" + if quantization_config.dataset in ["mbpp", "openai_humaneval"] + else "train" + ), +) + if shuffle: + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + if "code" in examples: + example = quantization_config.tokenizer(examples["code"]) + elif "prompt" in examples: + example = quantization_config.tokenizer(examples["prompt"]) + elif "text" in examples: + example = quantization_config.tokenizer(examples["text"]) + else: + logger.error( + "Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset." 
+ ) + exit(0) + return example + + def collate_batch(batch): + position_ids_padded = [] + input_ids_padded = [] + last_ind = [] + attention_mask_padded = [] + for text in batch: + input_ids = text["input_ids"] + if not padding: + input_ids = ( + input_ids[: int(max_input_length)] + if len(input_ids) > int(max_input_length) + else input_ids + ) # no_padding + else: + pad_len = max_input_length - input_ids.shape[0] + input_ids = pad( + input_ids, (0, pad_len), value=max_input_length + ) + + last_ind.append(input_ids.shape[0] - 1) + attention_mask = torch.ones(len(input_ids)) + position_ids = torch.arange(len(input_ids)) + input_ids_padded.append(input_ids) + attention_mask_padded.append(attention_mask) + position_ids_padded.append(position_ids) + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + return ( + { + "input_ids": torch.vstack(input_ids_padded), + "attention_mask": torch.vstack(attention_mask_padded), + "position_ids": torch.vstack(position_ids_padded), + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + else: + return ( + { + "input_ids": torch.vstack(input_ids_padded), + "attention_mask": torch.vstack(attention_mask_padded), + "past_key_values": past_key_values, + }, + torch.tensor(last_ind), + ) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + return calib_dataloader \ No newline at end of file diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 2bc0425e65b..a3763920971 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -24,7 +24,6 @@ from accelerate import init_empty_weights from datasets import load_dataset from neural_compressor import quantization -#from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.utils.utility import LazyImport from neural_compressor.torch.algorithms.weight_only.autoround import get_autoround_default_run_fn @@ -34,19 +33,24 @@ GPTQConfig, AWQConfig, TEQConfig, - StaticQuantConfig, SmoothQuantConfig, HQQConfig, convert, - get_default_AutoRound_config, - prepare + prepare, + quantize +) +from .sq_utils import ( + IPEX_OPT_LLM_SUPPORTED_DICT, + MODEL_TYPES_REQUIRING_POSITION_IDS, + generate_dummy_past_key_values_for_opt_llm, + generate_dummy_past_key_values, + get_dataloader ) from intel_extension_for_transformers.tools.utils import ( is_ipex_available, is_autoround_available, ) from transformers import AutoTokenizer - if is_ipex_available(): import intel_extension_for_pytorch as ipex @@ -387,7 +391,7 @@ def _replace_linear( return model, is_replaced -def default_run_fn(model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="GPTQ"): +def default_run_fn(model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="rtn"): from datasets import load_dataset from torch.utils.data import DataLoader @@ -647,3 +651,107 @@ def get_bits(config): config.weight_dtype ) return bits + +def convert_to_smoothquant_model(model, quantization_config): + if ipex.__version__ == "2.2.0+cpu": + logger.info("ipex.llm.optimize by 2.2.0 version 
supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])) + logger.info("The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] + elif ipex.__version__ == "2.3.0+cpu": + logger.info("ipex.llm.optimize by 2.3.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])) + logger.info("The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] + else: + logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] + model_type = model.config.model_type.replace("_", "-") + # ipex.optimize_transformers + if quantization_config.ipex_opt_llm is None: + if model_type in IPEX_OPT_LLM_SUPPORTED: + quantization_config.ipex_opt_llm = True + logger.info( + "quantization_config.ipex_opt_llm set to True and ipex.llm.optimize is used." + ) + else: + quantization_config.ipex_opt_llm = False + if quantization_config.ipex_opt_llm: + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5) + model = ipex.llm.optimize( + model.eval(), + quantization_config=qconfig, + dtype=torch.float32, + inplace=True, + deployment_mode=False, + ) + model.eval() + # past_key_values + num_beams = quantization_config.num_beams + if quantization_config.ipex_opt_llm: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=model.config, input_bs=1, num_beams=num_beams + ) + else: + past_key_values = generate_dummy_past_key_values( + config=model.config, input_bs=1 + ) + # get calibration dataloader + if quantization_config.alpha == "auto" and model_type == "llama": + calib_dataloader = get_dataloader(model_type, quantization_config, past_key_values=past_key_values, shuffle=True, padding=True, max_input_lenth=2048, pad_val=1) + else: + calib_dataloader = get_dataloader(model_type, quantization_config, past_key_values=past_key_values) + + def calib_func(model): + with torch.no_grad(): + for i, (inputs, last_ind) in enumerate(calib_dataloader): + if i >= quantization_config.nsamples: + break + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + model( + input_ids=inputs["input_ids"], + past_key_values=inputs["past_key_values"], + position_ids=inputs["position_ids"], + attention_mask=inputs["attention_mask"], + ) + else: + model( + input_ids=inputs["input_ids"], + past_key_values=inputs["past_key_values"], + attention_mask=inputs["attention_mask"], + ) + + # example_inputs + for i, (inputs, last_ind) in enumerate(calib_dataloader): + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + example_inputs = { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "position_ids": inputs["position_ids"], + "past_key_values": inputs["past_key_values"], + } + else: + example_inputs = { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "past_key_values": inputs["past_key_values"], + } + break + quant_config = SmoothQuantConfig( + alpha=quantization_config.alpha, + init_alpha=quantization_config.init_alpha, + alpha_min=quantization_config.alpha_min, + alpha_max=quantization_config.alpha_max, + alpha_step=quantization_config.alpha_step, + shared_criterion=quantization_config.shared_criterion, + do_blockwise=quantization_config.do_blockwise, + + ) + # fallback + if model_type in ["gptj", "gpt_neox", "mpt"]: + quant_config = quant_config.set_local(torch.add, 
SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) + q_model = quantize(model, quant_config=quant_config, run_fn=calib_func, example_inputs=example_inputs) + with torch.no_grad(): + q_model = torch.jit.trace(q_model.eval(), example_kwarg_inputs=example_inputs, strict=False, check_trace=False) + q_model = torch.jit.freeze(q_model.eval()) + q_model(**example_inputs) + q_model(**example_inputs) + return q_model \ No newline at end of file diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index cca3b40abcf..fae551a838e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -68,6 +68,7 @@ convert_dtype_str2torch, convert_dtype_torch2str, convert_to_quantized_model, + convert_to_smoothquant_model, replace_linear, ) from ...tools.utils import is_intel_gpu_available, is_ipex_available @@ -731,7 +732,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): assert ( ipex.__version__ >= "2.2.0+cpu" ), "Please use Intel Extension for PyTorch >=2.2.0+cpu." - config.torchscript = True config.use_cache = True model = cls.ORIG_MODEL.from_pretrained( @@ -742,7 +742,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): torch_dtype=torch.float, **kwargs, ) - if ( not torch.cuda.is_available() or device_map == "cpu" @@ -750,261 +749,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) and model.config.model_type == "chatglm": model = model.float() model.eval() - model_type = model.config.model_type.replace("_", "-") - if "llama" in model_type and transformers.__version__ >= "4.36.0": - quantization_config.ipex_opt_llm = False logger.info("Applying SmoothQuant.") - # ipex.optimize_transformers - if quantization_config.ipex_opt_llm is None: - if model_type in IPEX_OPT_LLM_SUPPORTED: - quantization_config.ipex_opt_llm = True - logger.info( - "quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used." - ) - logger.warning("The suggested transformers version is 4.35.2.") - else: - quantization_config.ipex_opt_llm = False - if quantization_config.ipex_opt_llm: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5) - model = ipex.optimize_transformers( - model.eval(), - quantization_config=qconfig, - dtype=torch.float32, - inplace=True, - deployment_mode=False, - ) - model.eval() - - # past_key_values - num_beams = quantization_config.num_beams - if quantization_config.ipex_opt_llm: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=model.config, input_bs=1, num_beams=num_beams - ) - else: - past_key_values = generate_dummy_past_key_values( - config=model.config, input_bs=1 - ) - - # calibration function - calib_func = quantization_config.calib_func - tokenizer = quantization_config.tokenizer - if calib_func is None: - if quantization_config.tokenizer is None: - logger.error( - "Please provide the tokenizer or provide calib_func directly," - + " the following is how to get tokenizer. 
\n" - + " from transformer import AutoTokenizer \n" - + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - - from datasets import load_dataset - from torch.utils.data import DataLoader - - calib_dataset = quantization_config.calib_dataset - calib_shuffle = quantization_config.calib_shuffle - calib_iters = quantization_config.calib_iters - calib_padding = quantization_config.calib_padding - calib_len = quantization_config.calib_len - calib_pad_val = quantization_config.calib_pad_val - from torch.nn.functional import pad - - calib_dataset = load_dataset( - calib_dataset, - split=( - "test" - if calib_dataset in ["mbpp", "openai_humaneval"] - else "train" - ), - ) - if calib_shuffle: - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - if "code" in examples: - example = tokenizer(examples["code"]) - elif "prompt" in examples: - example = tokenizer(examples["prompt"]) - elif "text" in examples: - example = tokenizer(examples["text"]) - else: - logger.error( - "Please check dataset prompt identifier," - + " NeelNanda/pile-10k is default used calibration dataset." - ) - exit(0) - return example - - def collate_batch(batch): - position_ids_padded = [] - input_ids_padded = [] - last_ind = [] - attention_mask_padded = [] - for text in batch: - input_ids = text["input_ids"] - if not calib_padding: - input_ids = ( - input_ids[: int(calib_len)] - if len(input_ids) > int(calib_len) - else input_ids - ) # no_padding - else: - pad_len = calib_len - input_ids.shape[0] - input_ids = pad( - input_ids, (0, pad_len), value=calib_pad_val - ) - - last_ind.append(input_ids.shape[0] - 1) - if model_type in ["bloom", "qwen"]: - attention_mask = torch.ones(len(input_ids) + 1) - attention_mask[0] = 0 - else: - attention_mask = torch.ones(len(input_ids)) - position_ids = torch.arange(len(input_ids)) - input_ids_padded.append(input_ids) - attention_mask_padded.append(attention_mask) - position_ids_padded.append(position_ids) - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - return ( - { - "input_ids": torch.vstack(input_ids_padded), - "attention_mask": torch.vstack(attention_mask_padded), - "position_ids": torch.vstack(position_ids_padded), - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - else: - return ( - { - "input_ids": torch.vstack(input_ids_padded), - "attention_mask": torch.vstack(attention_mask_padded), - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - - def collate_batch_for_chatglm(batch): - last_ind = [] - for text in batch: - input_ids = torch.vstack([text["input_ids"]]) - if re.search( - "THUDM/chatglm-6b", model.config.auto_map["AutoConfig"] - ): - input_ids = ( - input_ids[:, :calib_len] - if input_ids.shape[1] > calib_len - else input_ids - ) - eos = torch.tensor([130001, 130004]).repeat(1, 1) - input_ids = torch.cat((input_ids, eos), 1) - else: - input_ids = ( - input_ids[:, :calib_len] - if input_ids.shape[1] > calib_len - else input_ids - ) - prepared_inputs = model.prepare_inputs_for_generation(input_ids) - attention_mask = torch.ones_like(input_ids) - last_ind.append(input_ids.shape[1] - 1) - return ( - { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": prepared_inputs["position_ids"], - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - if model_type == "chatglm": - 
calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch_for_chatglm, - ) - else: - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - - def calib_func(model): - with torch.no_grad(): - for i, (inputs, last_ind) in enumerate(calib_dataloader): - if i >= calib_iters: - break - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - model( - input_ids=inputs["input_ids"], - past_key_values=inputs["past_key_values"], - position_ids=inputs["position_ids"], - attention_mask=inputs["attention_mask"], - ) - else: - model( - input_ids=inputs["input_ids"], - past_key_values=inputs["past_key_values"], - attention_mask=inputs["attention_mask"], - ) - - logger.info( - "The default calibration function is used, " - + "the calibration dataset is NeelNanda/pile-10k, " - + "batchsize is 1 and calibration iteration is 100." - ) - calib_func = calib_func - - # example_inputs - example_inputs = quantization_config.example_inputs - if example_inputs is None: - for i, (inputs, last_ind) in enumerate(calib_dataloader): - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - example_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "position_ids": inputs["position_ids"], - "past_key_values": inputs["past_key_values"], - } - else: - example_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "past_key_values": inputs["past_key_values"], - } - break - - # call inc smoothquant - from neural_compressor.torch.quantization import SmoothQuantConfig, quantize - quant_config = SmoothQuantConfig( - w_dtype=quantization_config.w_dtype, - w_sym=quantization_config.w_sym, - w_granularity=quantization_config.w_granularity, - w_algo=quantization_config.w_algo, - act_dtype=quantization_config.act_dtype, - alpha=quantization_config.alpha, - folding=quantization_config.folding, - scale_sharing=quantization_config.scale_sharing, - init_alpha=quantization_config.init_alpha, - alpha_min=quantization_config.alpha_min, - alpha_step=quantizate_config.alpha_step, - shared_criterion=quantization_config.shared_criterion, - do_blockwise=quantizate_config.do_blockwise, - auto_alpha_args=quantizate_config.auto_alpha_args, - white_list=quantizate_config.white_list, - ) - - model = quantize(model, - quant_config=quant_config, - run_fn=run_fn, - example_inputs=example_inputs - ) + model = convert_to_smoothquant_model(model, quantization_config) logger.info("SmoothQuant done.") elif isinstance(quantization_config, DynamicQuantConfig): model = cls.ORIG_MODEL.from_pretrained( @@ -1042,158 +788,158 @@ def calib_func(model): model.quantization_config = quantization_config logger.info("DynamicQuant done.") return model - elif isinstance(quantization_config, StaticQuantConfig): - if quantization_config.backend == "ipex": - try: - import intel_extension_for_pytorch as ipex - except ImportError: - logger.warning( - "Please install Intel Extension for PyTorch to accelerate the model inference." - ) - config.torchscript = True - assert quantization_config.example_inputs is not None, \ - "Please provide example_inputs for IPEX static quantization." 
- - model = cls.ORIG_MODEL.from_pretrained( - pretrained_model_name_or_path, - *model_args, - config=config, - low_cpu_mem_usage=True, - torch_dtype=torch.float, - **kwargs, - ) - - if ( - not torch.cuda.is_available() - or device_map == "cpu" - or device_map == torch.device("cpu") - ) and model.config.model_type == "chatglm": - model = model.float() - model.eval() - logger.info("Applying StaticQuant.") - # calibration function - calib_func = quantization_config.calib_func - tokenizer = quantization_config.tokenizer - if calib_func is None: - if quantization_config.tokenizer is None: - logger.error( - "Please provide the tokenizer or provide calib_func directly," - + " the following is how to get tokenizer. \n" - + " from transformer import AutoTokenizer \n" - + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - - from datasets import load_dataset - from torch.utils.data import DataLoader - - calib_dataset = quantization_config.calib_dataset - calib_shuffle = quantization_config.calib_shuffle - calib_iters = quantization_config.calib_iters - calib_padding = quantization_config.calib_padding - calib_len = quantization_config.calib_len - calib_pad_val = quantization_config.calib_pad_val - from torch.nn.functional import pad - - calib_dataset = load_dataset( - calib_dataset, - split=( - "test" - if calib_dataset in ["mbpp", "openai_humaneval"] - else "train" - ), - ) - if calib_shuffle: - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - if "code" in examples: - example = tokenizer(examples["code"]) - elif "prompt" in examples: - example = tokenizer(examples["prompt"]) - elif "text" in examples: - example = tokenizer(examples["text"]) - else: - logger.error( - "Please check dataset prompt identifier," - + " NeelNanda/pile-10k is default used calibration dataset." - ) - exit(0) - return example - - def collate_batch(batch): - input_ids_padded = [] - last_ind = [] - for text in batch: - input_ids = text["input_ids"] - if not calib_padding: - input_ids = ( - input_ids[: int(calib_len)] - if len(input_ids) > int(calib_len) - else input_ids - ) # no_padding - else: - pad_len = calib_len - input_ids.shape[0] - input_ids = pad( - input_ids, (0, pad_len), value=calib_pad_val - ) - - last_ind.append(input_ids.shape[0] - 1) - input_ids_padded.append(input_ids) - - return ( - { - "input_ids": torch.vstack(input_ids_padded), - }, - torch.tensor(last_ind), - ) - - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - - def calib_func(model): - with torch.no_grad(): - for i, (inputs, last_ind) in enumerate(calib_dataloader): - if i >= calib_iters: - break - model(**inputs) - - logger.info( - "The default calibration function is used, " - + "the calibration dataset is NeelNanda/pile-10k, " - + "batchsize is 1 and calibration iteration is 100." 
- ) - calib_func = calib_func - - - # call inc static quant - from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare - quant_config = StaticQuantConfig( - w_dtype=quantization_config.w_dtype, - w_sym=quantization_config.w_sym, - w_granularity=quantization_config.w_granularity, - w_algo=quantization_config.w_algo, - act_dtype=quantization_config.act_dtype, - act_sym=quantization_config.act_sym, - act_granularity=quantization_config.act_granularity, - act_algo=quantization_config.act_algo, - white_list=quantizate_config.white_list, - ) - prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs) - calib_func(prepared_model) - q_model = convert(prepared_model) - model.save_pretrained = types.MethodType(save_low_bit, model) - quantization_config.remove_redundant_parameters() - model.quantization_config = quantization_config - logger.info("StaticQuant done.") - return model + # elif isinstance(quantization_config, StaticQuantConfig): + # if quantization_config.backend == "ipex": + # try: + # import intel_extension_for_pytorch as ipex + # except ImportError: + # logger.warning( + # "Please install Intel Extension for PyTorch to accelerate the model inference." + # ) + # config.torchscript = True + # assert quantization_config.example_inputs is not None, \ + # "Please provide example_inputs for IPEX static quantization." + + # model = cls.ORIG_MODEL.from_pretrained( + # pretrained_model_name_or_path, + # *model_args, + # config=config, + # low_cpu_mem_usage=True, + # torch_dtype=torch.float, + # **kwargs, + # ) + + # if ( + # not torch.cuda.is_available() + # or device_map == "cpu" + # or device_map == torch.device("cpu") + # ) and model.config.model_type == "chatglm": + # model = model.float() + # model.eval() + # logger.info("Applying StaticQuant.") + # # calibration function + # calib_func = quantization_config.calib_func + # tokenizer = quantization_config.tokenizer + # if calib_func is None: + # if quantization_config.tokenizer is None: + # logger.error( + # "Please provide the tokenizer or provide calib_func directly," + # + " the following is how to get tokenizer. \n" + # + " from transformer import AutoTokenizer \n" + # + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + # ) + # exit(0) + + # from datasets import load_dataset + # from torch.utils.data import DataLoader + + # calib_dataset = quantization_config.calib_dataset + # calib_shuffle = quantization_config.calib_shuffle + # calib_iters = quantization_config.calib_iters + # calib_padding = quantization_config.calib_padding + # calib_len = quantization_config.calib_len + # calib_pad_val = quantization_config.calib_pad_val + # from torch.nn.functional import pad + + # calib_dataset = load_dataset( + # calib_dataset, + # split=( + # "test" + # if calib_dataset in ["mbpp", "openai_humaneval"] + # else "train" + # ), + # ) + # if calib_shuffle: + # calib_dataset = calib_dataset.shuffle(seed=42) + + # def tokenize_function(examples): + # if "code" in examples: + # example = tokenizer(examples["code"]) + # elif "prompt" in examples: + # example = tokenizer(examples["prompt"]) + # elif "text" in examples: + # example = tokenizer(examples["text"]) + # else: + # logger.error( + # "Please check dataset prompt identifier," + # + " NeelNanda/pile-10k is default used calibration dataset." 
+ # ) + # exit(0) + # return example + + # def collate_batch(batch): + # input_ids_padded = [] + # last_ind = [] + # for text in batch: + # input_ids = text["input_ids"] + # if not calib_padding: + # input_ids = ( + # input_ids[: int(calib_len)] + # if len(input_ids) > int(calib_len) + # else input_ids + # ) # no_padding + # else: + # pad_len = calib_len - input_ids.shape[0] + # input_ids = pad( + # input_ids, (0, pad_len), value=calib_pad_val + # ) + + # last_ind.append(input_ids.shape[0] - 1) + # input_ids_padded.append(input_ids) + + # return ( + # { + # "input_ids": torch.vstack(input_ids_padded), + # }, + # torch.tensor(last_ind), + # ) + + + # tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + # tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + # calib_dataloader = DataLoader( + # tokenized_dataset, + # batch_size=1, + # shuffle=False, + # collate_fn=collate_batch, + # ) + + # def calib_func(model): + # with torch.no_grad(): + # for i, (inputs, last_ind) in enumerate(calib_dataloader): + # if i >= calib_iters: + # break + # model(**inputs) + + # logger.info( + # "The default calibration function is used, " + # + "the calibration dataset is NeelNanda/pile-10k, " + # + "batchsize is 1 and calibration iteration is 100." + # ) + # calib_func = calib_func + + + # # call inc static quant + # from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare + # quant_config = StaticQuantConfig( + # w_dtype=quantization_config.w_dtype, + # w_sym=quantization_config.w_sym, + # w_granularity=quantization_config.w_granularity, + # w_algo=quantization_config.w_algo, + # act_dtype=quantization_config.act_dtype, + # act_sym=quantization_config.act_sym, + # act_granularity=quantization_config.act_granularity, + # act_algo=quantization_config.act_algo, + # white_list=quantizate_config.white_list, + # ) + # prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs) + # calib_func(prepared_model) + # q_model = convert(prepared_model) + # model.save_pretrained = types.MethodType(save_low_bit, model) + # quantization_config.remove_redundant_parameters() + # model.quantization_config = quantization_config + # logger.info("StaticQuant done.") + # return model elif isinstance(quantization_config, QuantAwareTrainingConfig): model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 04aa729f207..db45706ad37 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -688,50 +688,42 @@ def __init__( self.excluded_precisions = excluded_precisions self.example_inputs = example_inputs -class SmoothQuantConfig(StaticQuantConfig): +class SmoothQuantConfig(ITREXQuantizationConfigMixin): def __init__( self, - backend="ipex", tokenizer=None, - calib_dataset="NeelNanda/pile-10k", - calib_dataloader=None, - calib_func=None, - calib_shuffle=True, - calib_iters=100, - calib_padding=False, - calib_len=512, - calib_pad_val=1, - op_name_dict=None, - op_type_dict=None, + dataset="NeelNanda/pile-10k", + alpha=0.5, + scale_sharing = False, + init_alpha = 0.5, + alpha_min = 0.0, + alpha_max = 1.0, + alpha_step = 0.1, + shared_criterion = "max", + do_blockwise = False, + auto_alpha_args = None, + nsamples=100, excluded_precisions=[], - example_inputs=None, ipex_opt_llm=None, - alpha=0.5, - 
num_beams=1, - recipes={"smooth_quant": True, "smooth_quant_args":{"alpha":0.5}}, - **kwargs, + num_beams=1, + ): - super().__init__( - backend=backend, - tokenizer=tokenizer, - calib_dataset=calib_dataset, - calib_dataloader=calib_dataloader, - calib_func=calib_func, - calib_shuffle=calib_shuffle, - calib_iters=calib_iters, - calib_padding=calib_padding, - calib_len=calib_len, - calib_pad_val=calib_pad_val, - op_name_dict=op_name_dict, - op_type_dict=op_type_dict, - excluded_precisions=excluded_precisions, - example_inputs=example_inputs, - ) self.quant_method = QuantizationMethod.SmoothQuant - self.ipex_opt_llm = ipex_opt_llm + self.dataset = dataset + self.tokenizer=tokenizer self.alpha = alpha + self.scale_sharing = scale_sharing + self.init_alpha = init_alpha + self.alpha_min = alpha_min + self.alpha_max = alpha_max + self.alpha_step = alpha_step + self.shared_criterion = shared_criterion + self.do_blockwise = do_blockwise + self.auto_alpha_args = auto_alpha_args + self.nsamples=nsamples + self.ipex_opt_llm = ipex_opt_llm self.num_beams = num_beams - self.recipes = recipes + self.excluded_precisions = excluded_precisions class RtnConfig(ITREXQuantizationConfigMixin): def __init__( diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index 1086ea000a4..00357aeacb3 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -330,159 +330,159 @@ def test_quantization_for_llm(self): fp32_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, use_neural_speed=False) dummy_input = fp32_model.dummy_inputs["input_ids"] - # Dynamic quant - dq_config = DynamicQuantConfig() - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=dq_config, - ) - q_model.eval() - output = q_model(dummy_input) - q_model.save_pretrained("./saved_results") - output = q_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) - q_model = AutoModelForCausalLM.from_pretrained("./saved_results" - ) - output = q_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) - # Static quant - sq_config = StaticQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - calib_iters=2, - ) - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=sq_config, - ) - q_model.eval() - output = q_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) - q_model.save_pretrained("./saved_results") - loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") - loading_model.eval() - output = loading_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) - # Quant aware training - qat_config = QuantAwareTrainingConfig( - tokenizer=tokenizer, # either two of one, tokenizer or train_func - train_iters=2, - ) - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=qat_config, - ) - q_model.eval() - output = q_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) - q_model.save_pretrained("./saved_results") - loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") - loading_model.eval() - output = loading_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) - # Smoothquant - sq_config = SmoothQuantConfig( - 
tokenizer=tokenizer, # either two of one, tokenizer or calib_func - calib_iters=2, - ipex_opt_llm=False - ) - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=sq_config, - use_neural_speed=False - ) - self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule)) - - # Smoothquant auto - recipes = { - "smooth_quant": True, - "smooth_quant_args": { "alpha": "auto", "auto_alpha_args":{"alpha_max": 0.6, - "alpha_min":0.5, "alpha_step":0.1, "shared_criterion": "mean", "do_blockwise": False}}, - } - sq_config = SmoothQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - calib_iters=2, - recipes=recipes, - ipex_opt_llm=False - ) - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=sq_config, - use_neural_speed=False - ) - self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule)) - - # weight-only - # RTN - woq_config = RtnConfig(bits=4) - woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=woq_config, - use_neural_speed=False - ) - woq_model.eval() - output = woq_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047, rel_tol=1e-04)) - - # AWQ - woq_config = AwqConfig(bits=4, - zero_point=False, - calib_iters=5, - tokenizer=tokenizer - ) - - woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=woq_config, - use_neural_speed=False - ) - woq_model.eval() - output = woq_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.18019595742225647 , rel_tol=1e-04)) - - # TEQ - woq_config = TeqConfig(bits=4, - calib_iters=5, - tokenizer=tokenizer, - ) - woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=woq_config, - use_neural_speed=False - ) - woq_model.eval() - output = woq_model(dummy_input) - - # fp8 - woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") - woq_model = AutoModelForCausalLM.from_pretrained( - model_name_or_path, quantization_config=woq_config, use_neural_speed=False - ) - woq_model.eval() - output = woq_model(dummy_input) - self.assertTrue( - isclose(float(output[0][0][0][0]), 0.16162332892417908, rel_tol=1e-04) - ) - - # amp - amp_config = MixedPrecisionConfig() - amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=amp_config, - use_neural_speed=False - ) - amp_model.eval() - output = amp_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04)) - - # load_in_4bit - bit4_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - load_in_4bit=True, - use_neural_speed=False - ) - bit4_model.eval() - output = bit4_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.18726778030395508, rel_tol=1e-04)) - - # load_in_8bit - bit8_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - load_in_8bit=True, - use_neural_speed=False, - device_map="cpu" - ) - bit8_model.eval() - output = bit8_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.16759155690670013, rel_tol=1e-04)) + # # Dynamic quant + # dq_config = DynamicQuantConfig() + # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=dq_config, + # ) + # q_model.eval() + # output = q_model(dummy_input) + # q_model.save_pretrained("./saved_results") + # output = q_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 
0.17140813171863556, rel_tol=1e-04)) + # q_model = AutoModelForCausalLM.from_pretrained("./saved_results" + # ) + # output = q_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) + # # Static quant + # sq_config = StaticQuantConfig( + # tokenizer=tokenizer, # either two of one, tokenizer or calib_func + # calib_iters=2, + # ) + # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=sq_config, + # ) + # q_model.eval() + # output = q_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) + # q_model.save_pretrained("./saved_results") + # loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") + # loading_model.eval() + # output = loading_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) + # # Quant aware training + # qat_config = QuantAwareTrainingConfig( + # tokenizer=tokenizer, # either two of one, tokenizer or train_func + # train_iters=2, + # ) + # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=qat_config, + # ) + # q_model.eval() + # output = q_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) + # q_model.save_pretrained("./saved_results") + # loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") + # loading_model.eval() + # output = loading_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) + # # Smoothquant + # sq_config = SmoothQuantConfig( + # tokenizer=tokenizer, # either two of one, tokenizer or calib_func + # calib_iters=2, + # ipex_opt_llm=False + # ) + # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=sq_config, + # use_neural_speed=False + # ) + # self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule)) + + # # Smoothquant auto + # recipes = { + # "smooth_quant": True, + # "smooth_quant_args": { "alpha": "auto", "auto_alpha_args":{"alpha_max": 0.6, + # "alpha_min":0.5, "alpha_step":0.1, "shared_criterion": "mean", "do_blockwise": False}}, + # } + # sq_config = SmoothQuantConfig( + # tokenizer=tokenizer, # either two of one, tokenizer or calib_func + # calib_iters=2, + # recipes=recipes, + # ipex_opt_llm=False + # ) + # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=sq_config, + # use_neural_speed=False + # ) + # self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule)) + + # # weight-only + # # RTN + # woq_config = RtnConfig(bits=4) + # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=woq_config, + # use_neural_speed=False + # ) + # woq_model.eval() + # output = woq_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047, rel_tol=1e-04)) + + # # AWQ + # woq_config = AwqConfig(bits=4, + # zero_point=False, + # calib_iters=5, + # tokenizer=tokenizer + # ) + + # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=woq_config, + # use_neural_speed=False + # ) + # woq_model.eval() + # output = woq_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.18019595742225647 , rel_tol=1e-04)) + + # # TEQ + # woq_config = TeqConfig(bits=4, + # calib_iters=5, + # tokenizer=tokenizer, + # ) + # woq_model = 
AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=woq_config, + # use_neural_speed=False + # ) + # woq_model.eval() + # output = woq_model(dummy_input) + + # # fp8 + # woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") + # woq_model = AutoModelForCausalLM.from_pretrained( + # model_name_or_path, quantization_config=woq_config, use_neural_speed=False + # ) + # woq_model.eval() + # output = woq_model(dummy_input) + # self.assertTrue( + # isclose(float(output[0][0][0][0]), 0.16162332892417908, rel_tol=1e-04) + # ) + + # # amp + # amp_config = MixedPrecisionConfig() + # amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=amp_config, + # use_neural_speed=False + # ) + # amp_model.eval() + # output = amp_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04)) + + # # load_in_4bit + # bit4_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # load_in_4bit=True, + # use_neural_speed=False + # ) + # bit4_model.eval() + # output = bit4_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.18726778030395508, rel_tol=1e-04)) + + # # load_in_8bit + # bit8_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # load_in_8bit=True, + # use_neural_speed=False, + # device_map="cpu" + # ) + # bit8_model.eval() + # output = bit8_model(dummy_input) + # self.assertTrue(isclose(float(output[0][0][0][0]), 0.16759155690670013, rel_tol=1e-04)) # GPTQ woq_config = GPTQConfig(bits=4, @@ -494,6 +494,7 @@ def test_quantization_for_llm(self): nsamples=3, max_input_length=256, tokenizer=tokenizer, + batch_size=1 ) woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config, @@ -501,6 +502,7 @@ def test_quantization_for_llm(self): ) woq_model.eval() output = woq_model(dummy_input) + import pdb;pdb.set_trace(); self.assertTrue(isclose(float(output[0][0][0][0]), 0.17126554250717163, rel_tol=1e-04)) # AUTOROUND From c66f9847f217f0184f451ee74414bf72d2e130d8 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 22 May 2024 02:56:05 -0700 Subject: [PATCH 05/28] ssupport moothquant with fix alpha Signed-off-by: changwangss --- .../text-generation/quantization/README.md | 1 - .../quantization/llm_quantization_recipes.md | 40 +++--- .../quantization/run_generation_sq.py | 46 +++---- .../transformers/llm/evaluation/models.py | 1 + .../transformers/llm/quantization/sq_utils.py | 130 +++++++++++++++++- .../transformers/llm/quantization/utils.py | 19 +-- .../transformers/modeling/modeling_auto.py | 20 ++- .../transformers/utils/config.py | 7 +- tests/CI/test_quantization.py | 1 - 9 files changed, 191 insertions(+), 74 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md index 6fe9d66aa70..a61a2d84df8 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/README.md +++ b/examples/huggingface/pytorch/text-generation/quantization/README.md @@ -36,7 +36,6 @@ OMP_NUM_THREADS= numactl -m -C python ru --model \ --sq \ --output_dir \ # Default is "./saved_results." - --int8 \ --benchmark \ --batch_size 1 # load SQ model quantied by itrex and do benchmark. 
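For readers following the recipe changes above, this is roughly how the same fixed-alpha SmoothQuant flow looks through the Python API this series converges on; the model id and the alpha value are illustrative placeholders borrowed from the GPT-J recipe, not part of the patch itself.

```python
# Hedged sketch of the fixed-alpha SmoothQuant path exercised by run_generation_sq.py.
# Assumes intel-extension-for-transformers with the SmoothQuantConfig from this series.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    SmoothQuantConfig,
)

model_name = "EleutherAI/gpt-j-6b"  # placeholder model id from the recipe above
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Calibration uses the default NeelNanda/pile-10k dataset configured in SmoothQuantConfig.
sq_config = SmoothQuantConfig(tokenizer=tokenizer, alpha=0.85)

# from_pretrained calibrates, applies SmoothQuant, and returns a TorchScript int8 model.
q_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=sq_config,
)
```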
diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md index 0af551db7a2..bedbb24a5f7 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md +++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md @@ -56,10 +56,9 @@ pip install intel-extension-for-pytorch==2.3.0 python run_generation_sq.py \ --model EleutherAI/gpt-j-6b \ --output_dir ./saved_results \ - --trust_remote_code \ - --fallback_add \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq \ + --accuracy \ --batch_size 1 \ --alpha 0.85 ``` @@ -111,9 +110,8 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model facebook/opt-1.3b \ --output_dir ./saved_results \ - --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.9 ``` @@ -166,7 +164,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.5 ``` @@ -219,10 +217,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --calib_len 2048 \ - --fallback_add \ --calib_shuffle False \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.8, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'mean'}}}" ``` @@ -275,10 +272,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --calib_len 1024 \ - --fallback_add \ --calib_padding \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.75, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'max'}}}" ``` @@ -331,7 +327,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.8 ``` @@ -384,7 +380,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.9 ``` @@ -437,7 +433,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.95 ``` @@ -489,7 +485,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.95 ``` @@ -542,7 +538,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.65 ``` @@ -595,7 +591,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.5 ``` @@ -649,7 +645,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq 
--accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.75 ``` @@ -701,7 +697,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.9 ``` @@ -754,7 +750,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.6 ``` @@ -807,7 +803,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.7 ``` @@ -859,7 +855,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.75 ``` @@ -912,7 +908,7 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --int8 --sq --accuracy \ + --sq --accuracy \ --batch_size 1 \ --alpha 0.75 ``` diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index 39ec9e46353..f26d74eff05 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -59,7 +59,7 @@ # ============SmoothQuant configs============== parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") -parser.add_argument("--nsamples", default=100, help="Smooth quant calibration samples.") +parser.add_argument("--n_samples", default=100, help="Smooth quant calibration samples.") # sq alpha "auto" parameters parser.add_argument("--scale_sharing", action="store_true") parser.add_argument("--init_alpha", default="0.5", help="Smooth quant parameter.") @@ -90,10 +90,8 @@ args.model, torchscript=( True - if ( + if args.sq - or (args.int8 or args.int8_bf16_mixed) - ) else False ), # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
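The new command-line flags above map one-to-one onto the auto-alpha fields of SmoothQuantConfig; a tentative sketch of the resulting config for an `--alpha auto` run follows (keyword names track the SmoothQuantConfig definition in this series, values simply mirror the Llama-2 recipe earlier in this document and are not prescriptive).

```python
# Tentative auto-alpha SmoothQuant configuration assembled from the CLI flags above.
# Keyword names follow the SmoothQuantConfig in this series; values are illustrative only.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import SmoothQuantConfig

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder

sq_config = SmoothQuantConfig(
    tokenizer=tokenizer,
    alpha="auto",            # enable the per-layer alpha search instead of a fixed value
    init_alpha=0.8,
    alpha_min=0.8,
    alpha_max=0.99,
    alpha_step=0.01,
    shared_criterion="mean",
    do_blockwise=False,
    n_samples=100,           # calibration sample count, matching the --n_samples default
)
```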
@@ -149,45 +147,39 @@ quantization_config=quantization_config, trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, - use_neural_speed=False ) # save model if args.output_dir is not None and (args.sq or args.mixed_precision): tokenizer.save_pretrained(args.output_dir) if args.sq: + quantization_config.remove_redundant_parameters() + config.quantization_config = quantization_config config.save_pretrained(args.output_dir) torch.jit.save(user_model, args.output_dir + "/pytorch_model.bin") + #validate loading + user_model = AutoModelForCausalLM.from_pretrained( + args.output_dir, + trust_remote_code=args.trust_remote_code, + _commit_hash=args._commit_hash, + ) elif args.mixed_precision: user_model.save_pretrained(args.output_dir) - args.model = args.output_dir -if args.int8 or args.int8_bf16_mixed: - print("Loading SmoothQuant model from: ", args.model) - import intel_extension_for_pytorch as ipex - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, +if args.restore: + from intel_extension_for_transformers.transformers.utils.utility import ( + recover_model_from_json, + ) + user_model = recover_model_from_json( + args.model, + os.path.join(args.output_dir, "best_configure.json"), + args.trust_remote_code, ) - if args.restore: - from intel_extension_for_transformers.transformers.utils.utility import ( - recover_model_from_json, - ) - user_model = recover_model_from_json( - args.model, - os.path.join(args.output_dir, "best_configure.json"), - args.trust_remote_code, - ) - else: - user_model = TSModelCausalLMForITREX.from_pretrained( - args.model, - file_name="best_model.pt", - trust_remote_code=args.trust_remote_code, - ) + elif not (args.sq or args.mixed_precision): user_model = AutoModelForCausalLM.from_pretrained( args.model, trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, - use_neural_speed=False ) diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/models.py b/intel_extension_for_transformers/transformers/llm/evaluation/models.py index c95bd1f462f..ce5127b71ec 100644 --- a/intel_extension_for_transformers/transformers/llm/evaluation/models.py +++ b/intel_extension_for_transformers/transformers/llm/evaluation/models.py @@ -164,6 +164,7 @@ def forward( "attention_mask": attention_mask, } input_bs, input_len = input_ids.shape + import pdb;pdb.set_trace(); if self.use_cache and past_key_values is None: if model_type in IPEX_OPT_LLM_SUPPORTED: if model_type == "llama" and transformers.__version__ >= "4.36": diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index 132f0b3c5dc..a782449f38d 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -21,10 +21,29 @@ from datasets import load_dataset from torch.utils.data import DataLoader from torch.nn.functional import pad - +import re +import transformers +from typing import Optional, Tuple +from transformers.modeling_outputs import CausalLMOutputWithPast +from optimum.intel.generation.modeling import TSModelForCausalLM +from intel_extension_for_transformers.tools.utils import is_ipex_available +if is_ipex_available(): + import intel_extension_for_pytorch as ipex torch = LazyImport("torch") + IPEX_OPT_LLM_SUPPORTED_DICT = {"2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], 
"2.3": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "bloom", "codegen", "gptbigcode", "t5", "mixtral", "mpt"]} +if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": + logger.info("ipex.llm.optimize by 2.2.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])) + logger.info("The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] +elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": + logger.info("ipex.llm.optimize by 2.3.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])) + logger.info("The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] +else: + logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") + IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] MODEL_TYPES_REQUIRING_POSITION_IDS = { "codegen", @@ -69,7 +88,6 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): new_shape = [input_bs * num_key_value_heads, d_k, 1] else: new_shape = [input_bs * num_key_value_heads, 1, d_k] - else: new_shape = [input_bs, num_key_value_heads, 1, d_k] @@ -220,4 +238,110 @@ def collate_batch(batch): shuffle=False, collate_fn=collate_batch, ) - return calib_dataloader \ No newline at end of file + return calib_dataloader + +class TSModelCausalLMForITREX(TSModelForCausalLM): + def _reorder_cache( + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + """This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. + + This is required to match `past_key_values` with the correct beam_idx at every generation step. + """ + if self.config.model_type == "bloom": + return self._reorder_cache_bloom(past_key_values, beam_idx) + if self.config.model_type == "chatglm": + return tuple( + tuple( + past_state.index_select(1, beam_idx.to(past_state.device)) + for past_state in layer_past + ) + for layer_past in past_key_values + ) + if len(past_key_values[0]) == 4: # discrete kv_cache + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + else: + return tuple( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ) + for layer_past in past_key_values + ) + + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + past_key_values = past_key_values or kwargs.get("past", None) + + if self.use_cache and past_key_values is not None: + input_ids = input_ids[:, -1:] + + # `past_key_values` may be in the standard format (e.g. 
in contrastive search), + # converts to bloom's format if needed + if past_key_values is not None and self.config.model_type == "bloom": + if past_key_values[0][0].shape[0] == input_ids.shape[0]: + past_key_values = self._convert_to_bloom_cache(past_key_values) + position_ids = kwargs.get("position_ids", None) + + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": self.use_cache, + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": None, + } + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + position_ids: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> CausalLMOutputWithPast: + model_type = self.config.model_type.replace("_", "-") + inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + input_bs, input_len = input_ids.shape + + if self.use_cache and past_key_values is None: + if model_type in IPEX_OPT_LLM_SUPPORTED: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=self.config, input_bs=input_bs, num_beams=1 + ) + else: + past_key_values = generate_dummy_past_key_values( + config=self.config, input_bs=input_bs + ) + inputs["past_key_values"] = past_key_values + if attention_mask is None: + inputs["attention_mask"] = torch.ones_like(input_ids) + + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + if position_ids is not None: + inputs["position_ids"] = position_ids + else: + inputs["position_ids"] = torch.arange(input_len).repeat(input_bs, 1) + outputs = self.model(**inputs) + + if isinstance(outputs, (list, tuple)): + logits = outputs[0] + past_key_values = outputs[1] if self.use_cache else None + else: + logits = outputs["logits"] + past_key_values = outputs["past_key_values"] if self.use_cache else None + return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index a3763920971..0bb6f59432d 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -40,7 +40,7 @@ quantize ) from .sq_utils import ( - IPEX_OPT_LLM_SUPPORTED_DICT, + IPEX_OPT_LLM_SUPPORTED, MODEL_TYPES_REQUIRING_POSITION_IDS, generate_dummy_past_key_values_for_opt_llm, generate_dummy_past_key_values, @@ -653,17 +653,6 @@ def get_bits(config): return bits def convert_to_smoothquant_model(model, quantization_config): - if ipex.__version__ == "2.2.0+cpu": - logger.info("ipex.llm.optimize by 2.2.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])) - logger.info("The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version.") - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] - elif ipex.__version__ == "2.3.0+cpu": - logger.info("ipex.llm.optimize by 2.3.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])) - logger.info("The recommended transformers version is 
4.38.1 if you used IPEX 2.3.0 version.") - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] - else: - logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] model_type = model.config.model_type.replace("_", "-") # ipex.optimize_transformers if quantization_config.ipex_opt_llm is None: @@ -699,11 +688,11 @@ def convert_to_smoothquant_model(model, quantization_config): calib_dataloader = get_dataloader(model_type, quantization_config, past_key_values=past_key_values, shuffle=True, padding=True, max_input_lenth=2048, pad_val=1) else: calib_dataloader = get_dataloader(model_type, quantization_config, past_key_values=past_key_values) - + def calib_func(model): with torch.no_grad(): for i, (inputs, last_ind) in enumerate(calib_dataloader): - if i >= quantization_config.nsamples: + if i >= quantization_config.n_samples: break if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: model( @@ -754,4 +743,4 @@ def calib_func(model): q_model = torch.jit.freeze(q_model.eval()) q_model(**example_inputs) q_model(**example_inputs) - return q_model \ No newline at end of file + return q_model diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index fae551a838e..b65de03bed1 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -413,7 +413,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) - if kwargs.get("use_llm_runtime", None) is not None: + if quantization_config is not None and quantization_config.quant_method in ["sq"]: + use_neural_speed = False + elif hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and "quant_method" in config.quantization_config and config.quantization_config["quant_method"] in ["sq"]: + use_neural_speed = False + elif kwargs.get("use_llm_runtime", None) is not None: use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu logger.warning( "use_llm_runtime is deprecated in version 1.3.2, please use_neural_speed instead." @@ -1226,6 +1230,8 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): quantization_config = DynamicQuantConfig.from_dict(quantization_config) elif quantization_config["quant_method"] == "qat": quantization_config = QuantAwareTrainingConfig.from_dict(quantization_config) + elif quantization_config["quant_method"] == "sq": + quantization_config = SmoothQuantConfig.from_dict(quantization_config) assert ( quantization_config is not None ), "Detect this model is not a low-bit model." 
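Once a model has been saved this way, the branch above lets it come back through the TorchScript wrapper; a minimal loading sketch, assuming a directory produced by an `--sq` run (the directory and file name are placeholders consistent with the script changes in this series):

```python
# Minimal sketch: load a previously saved SmoothQuant TorchScript model through the
# wrapper class this series adds to sq_utils.py. The directory path is a placeholder.
from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import (
    TSModelCausalLMForITREX,
)

q_model = TSModelCausalLMForITREX.from_pretrained(
    "./saved_results",              # directory written by run_generation_sq.py --sq
    file_name="pytorch_model.bin",  # the torch.jit.save artifact
    trust_remote_code=False,
)
q_model = q_model.eval() if hasattr(q_model, "eval") else q_model
```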
@@ -1478,7 +1484,17 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): q_model = load(weights_file, model, dataloader=None) del model return q_model - + if quantization_config.quant_method in ["sq"]: + print("Loading SmoothQuant model from: ", pretrained_model_name_or_path) + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( + TSModelCausalLMForITREX, + ) + q_model = TSModelCausalLMForITREX.from_pretrained( + pretrained_model_name_or_path, + file_name=WEIGHTS_NAME, + trust_remote_code=trust_remote_code, + ) + return q_model dtype_orig = None if torch_dtype is not None: if isinstance(torch_dtype, str): diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index db45706ad37..65c478f7000 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -702,10 +702,11 @@ def __init__( shared_criterion = "max", do_blockwise = False, auto_alpha_args = None, - nsamples=100, + n_samples=100, excluded_precisions=[], ipex_opt_llm=None, - num_beams=1, + num_beams=1, + **kwargs, ): self.quant_method = QuantizationMethod.SmoothQuant @@ -720,7 +721,7 @@ def __init__( self.shared_criterion = shared_criterion self.do_blockwise = do_blockwise self.auto_alpha_args = auto_alpha_args - self.nsamples=nsamples + self.n_samples=n_samples self.ipex_opt_llm = ipex_opt_llm self.num_beams = num_beams self.excluded_precisions = excluded_precisions diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index 00357aeacb3..104f13e66b1 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -502,7 +502,6 @@ def test_quantization_for_llm(self): ) woq_model.eval() output = woq_model(dummy_input) - import pdb;pdb.set_trace(); self.assertTrue(isclose(float(output[0][0][0][0]), 0.17126554250717163, rel_tol=1e-04)) # AUTOROUND From 0d064a08ef4a1df26aa60c3493daeff2d88e6f71 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 22 May 2024 23:43:01 -0700 Subject: [PATCH 06/28] migrate restore sq from json Signed-off-by: changwangss --- .../text-generation/quantization/README.md | 9 +- .../quantization/llm_quantization_recipes.md | 94 +++--- .../quantization/run_benchmark.sh | 6 +- .../quantization/run_generation_sq.py | 101 ++++--- .../transformers/llm/evaluation/models.py | 204 ------------- .../transformers/llm/quantization/sq_utils.py | 234 ++++++++++++--- .../transformers/llm/quantization/utils.py | 276 +++++++++++------- .../transformers/utils/config.py | 208 ++++++++----- 8 files changed, 615 insertions(+), 517 deletions(-) delete mode 100644 intel_extension_for_transformers/transformers/llm/evaluation/models.py diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md index a61a2d84df8..9e0f5f94ed9 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/README.md +++ b/examples/huggingface/pytorch/text-generation/quantization/README.md @@ -41,15 +41,13 @@ OMP_NUM_THREADS= numactl -m -C python ru # load SQ model quantied by itrex and do benchmark. OMP_NUM_THREADS= numactl -m -C python run_generation_sq.py \ --model \ - --int8 \ --benchmark \ --batch_size 1 # load SQ model quantied configure.json and do benchmark. 
python run_generation_sq.py \ --model \ --output_dir \ - --int8 \ - --restore \ + --restore_sq_model_from_json \ --benchmark \ --batch_size 1 ``` @@ -67,14 +65,12 @@ python run_generation_sq.py \ --model \ --sq \ --output_dir \ # Default is "./saved_results." - --int8 \ --accuracy \ --batch_size 56 # load SQ model quantied by itrex and do benchmark. python run_generation_sq.py \ --model \ - --int8 \ --accuracy \ --batch_size 56 @@ -82,8 +78,7 @@ python run_generation_sq.py \ python run_generation_sq.py \ --model \ --output_dir \ - --int8 \ - --restore \ + --restore_sq_model_from_json \ --accuracy \ --batch_size 56 diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md index bedbb24a5f7..86e98ff8e57 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md +++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md @@ -59,7 +59,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 1 \ + --batch_size 56 \ --alpha 0.85 ``` @@ -111,8 +111,9 @@ python run_generation_sq.py \ --model facebook/opt-1.3b \ --output_dir ./saved_results \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.9 ``` @@ -164,8 +165,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.5 ``` @@ -215,13 +217,17 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model meta-llama/Llama-2-7b-hf \ --output_dir ./saved_results \ - --trust_remote_code \ - --calib_len 2048 \ - --calib_shuffle False \ --tasks lambada_openai \ - --sq --accuracy \ + --sq \ + --accuracy \ --batch_size 1 \ - --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.8, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'mean'}}}" + --init_alpha 0.8 \ + --alpha_min 0.8 \ + --alpha_max 0.99 \ + --alpha_step 0.01 \ + --shared_criterion mean \ + --seq_len 2048 \ + --alpha auto ``` ### Weight-Only Quantization @@ -270,13 +276,17 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model meta-llama/Llama-2-13b-hf \ --output_dir ./saved_results \ - --trust_remote_code \ - --calib_len 1024 \ - --calib_padding \ + --seq_len 1024 \ --tasks lambada_openai \ - --sq --accuracy \ + --sq \ + --accuracy \ --batch_size 1 \ - --recipes "{'smooth_quant': True, 'smooth_quant_args': {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8, 'auto_alpha_args': {'alpha_min': 0.75, 'alpha_max': 0.99, 'alpha_step': 0.01, 'shared_criterion': 'max'}}}" + --init_alpha 0.8 \ + --alpha_min 0.75 \ + --alpha_max 0.99 \ + --alpha_step 0.01 \ + --shared_criterion max \ + --alpha auto ``` ### Weight-Only Quantization @@ -325,10 +335,10 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model meta-llama/Llama-2-70b-hf \ --output_dir ./saved_results \ - --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.8 ``` @@ -380,8 +390,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + 
--batch_size 56 \ --alpha 0.9 ``` @@ -485,8 +496,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.95 ``` @@ -538,8 +550,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.65 ``` @@ -645,8 +658,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.75 ``` @@ -697,8 +711,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.9 ``` @@ -748,10 +763,10 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model bigscience/bloom-1b7 \ --output_dir ./saved_results \ - --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.6 ``` @@ -801,10 +816,10 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model EleutherAI/gpt-neox-20b \ --output_dir ./saved_results \ - --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.7 ``` @@ -855,8 +870,9 @@ python run_generation_sq.py \ --output_dir ./saved_results \ --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.75 ``` @@ -906,10 +922,10 @@ python run_generation_cpu_woq.py \ python run_generation_sq.py \ --model databricks/dolly-v2-12b \ --output_dir ./saved_results \ - --trust_remote_code \ --tasks lambada_openai \ - --sq --accuracy \ - --batch_size 1 \ + --sq \ + --accuracy \ + --batch_size 56 \ --alpha 0.75 ``` diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index 085e3da3574..c8a27fb0375 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -249,9 +249,9 @@ function run_benchmark { fi fi if [[ ${int8} == "true" ]] && [[ "$model_source" != "huggingface" ]]; then - if [[ "${script}" == "run_generation_sq.py" ]] && [[ "${topology}" != "gpt_j_mp" ]];then - extra_cmd=$extra_cmd" --int8" - fi + # if [[ "${script}" == "run_generation_sq.py" ]] && [[ "${topology}" != "gpt_j_mp" ]];then + # extra_cmd=$extra_cmd" --int8" + # fi model_name_or_path=$tuned_checkpoint fi if [[ $backend == "neuralspeed" ]]; then diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index f26d74eff05..b32655bac12 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -1,21 +1,21 @@ import argparse +import json import os import re import time -import json + import torch +from optimum.intel.generation.modeling import TSModelForCausalLM from transformers import AutoConfig, AutoTokenizer -from 
intel_extension_for_transformers.transformers import ( - AutoModelForCausalLM, - AutoModel, -) from transformers.utils import check_min_version -from intel_extension_for_transformers.transformers.utils import str2bool -from optimum.intel.generation.modeling import TSModelForCausalLM + from intel_extension_for_transformers.transformers import ( + AutoModel, + AutoModelForCausalLM, MixedPrecisionConfig, SmoothQuantConfig, ) +from intel_extension_for_transformers.transformers.utils import str2bool parser = argparse.ArgumentParser() parser.add_argument("--model", default=None) @@ -34,7 +34,7 @@ help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", ) parser.add_argument( - "--restore", + "--restore_sq_model_from_json", action="store_true", help="restore ipex quantized model from output_dir/best_configure.json", ) @@ -59,13 +59,26 @@ # ============SmoothQuant configs============== parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default=0.5, help="Smooth quant parameter.") -parser.add_argument("--n_samples", default=100, help="Smooth quant calibration samples.") +parser.add_argument( + "--n_samples", default=100, type=int, help="Smooth quant calibration samples." +) +parser.add_argument( + "--seq_len", default=512, type=int, help="Smooth quant calibration input length." +) # sq alpha "auto" parameters parser.add_argument("--scale_sharing", action="store_true") -parser.add_argument("--init_alpha", default="0.5", help="Smooth quant parameter.") -parser.add_argument("--alpha_min", default="0.0", help="Smooth quant parameter.") -parser.add_argument("--alpha_max", default="1.0", help="Smooth quant parameter.") -parser.add_argument("--alpha_step", default="0.1", help="Smooth quant parameter.") +parser.add_argument( + "--init_alpha", default=0.5, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_min", default=0.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_max", default=1.0, type=float, help="Smooth quant parameter." +) +parser.add_argument( + "--alpha_step", default=0.1, type=float, help="Smooth quant parameter." +) parser.add_argument("--shared_criterion", default="max", type=str) parser.add_argument("--do_blockwise", action="store_true") # ============AutoModel parameters============== @@ -89,10 +102,7 @@ config = AutoConfig.from_pretrained( args.model, torchscript=( - True - if - args.sq - else False + True if args.sq else False ), # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
trust_remote_code=args.trust_remote_code, @@ -123,18 +133,20 @@ if args.mixed_precision: quantization_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16 elif args.sq: - excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + excluded_precisions = ["bf16"] quantization_config = SmoothQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - excluded_precisions=excluded_precisions, # default is [] - alpha = args.alpha, - scale_sharing = args.scale_sharing, + tokenizer=tokenizer, + seq_len=args.seq_len, + n_samples=args.n_samples, + excluded_precisions=excluded_precisions, + alpha=args.alpha if args.alpha == "auto" else float(args.alpha), + scale_sharing=args.scale_sharing, init_alpha=args.init_alpha, alpha_min=args.alpha_min, alpha_max=args.alpha_max, alpha_step=args.alpha_step, shared_criterion=args.shared_criterion, - do_blockwise = args.do_blockwise, + do_blockwise=args.do_blockwise, num_beams=generate_kwargs["num_beams"], ) else: @@ -156,7 +168,9 @@ config.quantization_config = quantization_config config.save_pretrained(args.output_dir) torch.jit.save(user_model, args.output_dir + "/pytorch_model.bin") - #validate loading + with open(args.output_dir + "/best_configure.json", "w") as f: + json.dump(user_model.tune_cfg, f, indent=4) + # validate loading user_model = AutoModelForCausalLM.from_pretrained( args.output_dir, trust_remote_code=args.trust_remote_code, @@ -165,8 +179,8 @@ elif args.mixed_precision: user_model.save_pretrained(args.output_dir) -if args.restore: - from intel_extension_for_transformers.transformers.utils.utility import ( +if args.restore_sq_model_from_json: + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( recover_model_from_json, ) user_model = recover_model_from_json( @@ -183,7 +197,6 @@ ) - if args.benchmark: user_model = user_model.eval() if hasattr(user_model, "eval") else user_model prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun." 
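The hunks above change how a SmoothQuant run is persisted: besides the TorchScript weights, the script now dumps the INC tuning recipe (`user_model.tune_cfg`) to `best_configure.json`, so the quantized model can later be rebuilt without re-calibrating. A minimal sketch of that save step, assuming `user_model` is the traced, quantized model returned by `from_pretrained` and `output_dir` already exists (the helper name is illustrative, not part of the patch):

```python
import json
import os

import torch


def save_sq_artifacts(user_model, output_dir):
    # TorchScript graph holding the quantized weights, loadable via torch.jit.load.
    torch.jit.save(user_model, os.path.join(output_dir, "pytorch_model.bin"))
    # Tuning recipe consumed later by --restore_sq_model_from_json / recover_model_from_json.
    # `tune_cfg` is assumed to be a plain dict, as the script above treats it.
    with open(os.path.join(output_dir, "best_configure.json"), "w") as f:
        json.dump(user_model.tune_cfg, f, indent=4)
```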
@@ -242,18 +255,32 @@ if args.accuracy: - args.model = (peft_config.base_model_name_or_path if args.peft_model_id else args.model) + args.model = ( + peft_config.base_model_name_or_path if args.peft_model_id else args.model + ) + + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import ( + LMEvalParser, + evaluate, + ) - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser - args = LMEvalParser(model = "hf", - tokenizer = tokenizer, - user_model = user_model, - tasks = args.tasks, - device = "cpu", - batch_size = args.batch_size) + args = LMEvalParser( + model="hf", + tokenizer=tokenizer, + user_model=user_model, + tasks=args.tasks, + device="cpu", + batch_size=args.batch_size, + ) results = evaluate(args) for task_name in args.tasks.split(","): if task_name == "wikitext": - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"])) + print( + "Accuracy for %s is: %s" + % (task_name, results["results"][task_name]["word_perplexity,none"]) + ) else: - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"])) + print( + "Accuracy for %s is: %s" + % (task_name, results["results"][task_name]["acc,none"]) + ) diff --git a/intel_extension_for_transformers/transformers/llm/evaluation/models.py b/intel_extension_for_transformers/transformers/llm/evaluation/models.py deleted file mode 100644 index ce5127b71ec..00000000000 --- a/intel_extension_for_transformers/transformers/llm/evaluation/models.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -import torch -import transformers -from typing import Optional, Tuple -from transformers.modeling_outputs import CausalLMOutputWithPast -from optimum.intel.generation.modeling import TSModelForCausalLM -from intel_extension_for_transformers.transformers.utils.utility import ( - generate_dummy_past_key_values_for_inference, - generate_dummy_past_key_values_for_opt_llm, - MODEL_TYPES_REQUIRING_POSITION_IDS, - IPEX_OPT_LLM_SUPPORTED, -) - - -class TSModelCausalLMForITREX(TSModelForCausalLM): - def _reorder_cache( - self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor - ) -> Tuple[Tuple[torch.Tensor]]: - """This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. - - This is required to match `past_key_values` with the correct beam_idx at every generation step. 
- """ - if self.config.model_type == "bloom": - return self._reorder_cache_bloom(past_key_values, beam_idx) - if self.config.model_type == "chatglm": - return tuple( - tuple( - past_state.index_select(1, beam_idx.to(past_state.device)) - for past_state in layer_past - ) - for layer_past in past_key_values - ) - if len(past_key_values[0]) == 4: # discrete kv_cache - for layer_past in past_key_values: - layer_past[3][layer_past[0].size(-2) - 1] = beam_idx - return past_key_values - else: - return tuple( - tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past - ) - for layer_past in past_key_values - ) - - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - past_key_values = past_key_values or kwargs.get("past", None) - - if self.use_cache and past_key_values is not None: - if not ( - self.config.model_type == "chatglm" - and re.search("THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]) - ): - input_ids = input_ids[:, -1:] - - # `past_key_values` may be in the standard format (e.g. in contrastive search), - # converts to bloom's format if needed - if past_key_values is not None and self.config.model_type == "bloom": - if past_key_values[0][0].shape[0] == input_ids.shape[0]: - past_key_values = self._convert_to_bloom_cache(past_key_values) - position_ids = kwargs.get("position_ids", None) - - attention_mask = kwargs.get("attention_mask", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - if self.config.model_type == "chatglm" and re.search( - "THUDM/chatglm-6b", self.config.auto_map["AutoConfig"] - ): - MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id - seqs = input_ids.tolist() - mask_positions, use_gmasks = [], [] - for seq in seqs: - mask_token = gMASK if gMASK in seq else MASK - use_gmask = mask_token == gMASK - mask_positions.append(seq.index(mask_token)) - use_gmasks.append(use_gmask) - batch_size, seq_length = input_ids.shape - device = input_ids.device - if past_key_values is None: - context_lengths = [ - seq.tolist().index(self.config.bos_token_id) for seq in input_ids - ] - position_ids = ( - torch.arange(seq_length, dtype=torch.long, device=device) - .unsqueeze(0) - .repeat(batch_size, 1) - ) - for i, context_length in enumerate(context_lengths): - position_ids[i, context_length:] = mask_positions[i] - block_position_ids = [ - torch.cat( - ( - torch.zeros( - context_length, dtype=torch.long, device=device - ), - torch.arange( - seq_length - context_length, - dtype=torch.long, - device=device, - ) - + 1, - ) - ) - for context_length in context_lengths - ] - block_position_ids = torch.stack(block_position_ids, dim=0) - position_ids = torch.stack((position_ids, block_position_ids), dim=1) - else: - context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs] - position_ids = torch.tensor( - [ - [mask_position, seq_length - context_length] - for mask_position, context_length in zip( - mask_positions, context_lengths - ) - ], - dtype=torch.long, - device=input_ids.device, - ).unsqueeze(-1) - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": self.use_cache, - "position_ids": position_ids, - 
"attention_mask": attention_mask, - "token_type_ids": None, - } - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - position_ids: Optional[torch.FloatTensor] = None, - **kwargs, - ) -> CausalLMOutputWithPast: - model_type = self.config.model_type.replace("_", "-") - inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - input_bs, input_len = input_ids.shape - import pdb;pdb.set_trace(); - if self.use_cache and past_key_values is None: - if model_type in IPEX_OPT_LLM_SUPPORTED: - if model_type == "llama" and transformers.__version__ >= "4.36": - past_key_values = generate_dummy_past_key_values_for_inference( - config=self.config, input_bs=input_bs - ) - else: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=self.config, input_bs=input_bs, num_beams=1 - ) - else: - past_key_values = generate_dummy_past_key_values_for_inference( - config=self.config, input_bs=input_bs - ) - inputs["past_key_values"] = past_key_values - if attention_mask is None: - inputs["attention_mask"] = torch.ones_like(input_ids) - if model_type == "chatglm": - if re.search("THUDM/chatglm-6b", self.config.auto_map["AutoConfig"]): - position_ids = self.prepare_inputs_for_generation(input_ids)[ - "position_ids" - ] - - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - if position_ids is not None: - inputs["position_ids"] = position_ids - else: - inputs["position_ids"] = torch.arange(input_len).repeat(input_bs, 1) - outputs = self.model(**inputs) - - if isinstance(outputs, (list, tuple)): - logits = outputs[0] - past_key_values = outputs[1] if self.use_cache else None - else: - logits = outputs["logits"] - past_key_values = outputs["past_key_values"] if self.use_cache else None - return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index a782449f38d..b914f89425c 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -14,32 +14,60 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from ...utils import ( - logger, - LazyImport, -) -from datasets import load_dataset -from torch.utils.data import DataLoader -from torch.nn.functional import pad import re -import transformers from typing import Optional, Tuple -from transformers.modeling_outputs import CausalLMOutputWithPast + +import transformers +from datasets import load_dataset from optimum.intel.generation.modeling import TSModelForCausalLM +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers.modeling_outputs import CausalLMOutputWithPast + from intel_extension_for_transformers.tools.utils import is_ipex_available + +from ...utils import LazyImport, logger + if is_ipex_available(): import intel_extension_for_pytorch as ipex torch = LazyImport("torch") -IPEX_OPT_LLM_SUPPORTED_DICT = {"2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], - "2.3": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "bloom", "codegen", "gptbigcode", "t5", "mixtral", "mpt"]} +IPEX_OPT_LLM_SUPPORTED_DICT = { + "2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], + "2.3": [ + "gptj", + "opt", + "llama", + "falcon", + "chatglm", + "baichuan", + "bloom", + "codegen", + "gptbigcode", + "t5", + "mixtral", + "mpt", + ], +} if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": - logger.info("ipex.llm.optimize by 2.2.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])) - logger.info("The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version.") + logger.info( + "ipex.llm.optimize by 2.2.0 version supported model family: {}".format( + ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]) + ) + ) + logger.info( + "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version." + ) IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": - logger.info("ipex.llm.optimize by 2.3.0 version supported model family: ", ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])) - logger.info("The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version.") + logger.info( + "ipex.llm.optimize by 2.3.0 version supported model family: {}".format( + ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]) + ) + ) + logger.info( + "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version." 
+ ) IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] else: logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") @@ -56,18 +84,35 @@ "llama", "mistral", "chatglm", - "baichuan" + "baichuan", } + def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): """Generate the dummy past_key_values.""" from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": - new_shape = [input_bs, 1, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + new_shape = [ + input_bs, + 1, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] elif config.model_type == "baichuan": - new_shape = [input_bs, config.num_attention_heads, 1, normalized_config.hidden_size//config.num_attention_heads] + new_shape = [ + input_bs, + config.num_attention_heads, + 1, + config.hidden_size // config.num_attention_heads, + ] elif config.model_type == "chatglm": - new_shape = [1, input_bs, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + new_shape = [ + 1, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] else: normalized_config = NormalizedConfigManager.get_normalized_config_class( config.model_type @@ -105,15 +150,32 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): ] return tuple(past_key_values) + def generate_dummy_past_key_values(config, input_bs): """Generate the dummy past_key_values.""" from optimum.utils import NormalizedConfigManager + if config.model_type == "qwen": - new_shape = [input_bs, 1, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + new_shape = [ + input_bs, + 1, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] elif config.model_type == "baichuan": - new_shape = [input_bs, config.num_attention_heads, 1, normalized_config.hidden_size//config.num_attention_heads] + new_shape = [ + input_bs, + config.num_attention_heads, + 1, + config.hidden_size // config.num_attention_heads, + ] elif config.model_type == "chatglm": - new_shape = [1, input_bs, config.num_attention_heads, normalized_config.hidden_size//config.num_attention_heads] + new_shape = [ + 1, + input_bs, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ] else: normalized_config = NormalizedConfigManager.get_normalized_config_class( config.model_type @@ -157,15 +219,23 @@ def generate_dummy_past_key_values(config, input_bs): ] return tuple(past_key_values) -def get_dataloader(model_type, quantization_config, past_key_values, shuffle=False, padding=False, max_input_length=512, pad_val=None): + +def get_dataloader( + model_type, + quantization_config, + past_key_values, + shuffle=False, + padding=False, + seq_len=512, +): calib_dataset = load_dataset( - quantization_config.dataset, - split=( - "test" - if quantization_config.dataset in ["mbpp", "openai_humaneval"] - else "train" - ), -) + quantization_config.dataset, + split=( + "test" + if quantization_config.dataset in ["mbpp", "openai_humaneval"] + else "train" + ), + ) if shuffle: calib_dataset = calib_dataset.shuffle(seed=42) @@ -193,15 +263,13 @@ def collate_batch(batch): input_ids = text["input_ids"] if not padding: input_ids = ( - input_ids[: int(max_input_length)] - if len(input_ids) > int(max_input_length) + input_ids[: int(seq_len)] + if len(input_ids) > int(seq_len) else input_ids ) # no_padding else: - pad_len = 
max_input_length - input_ids.shape[0] - input_ids = pad( - input_ids, (0, pad_len), value=max_input_length - ) + pad_len = seq_len - input_ids.shape[0] + input_ids = pad(input_ids, (0, pad_len), value=seq_len) last_ind.append(input_ids.shape[0] - 1) attention_mask = torch.ones(len(input_ids)) @@ -240,6 +308,7 @@ def collate_batch(batch): ) return calib_dataloader + class TSModelCausalLMForITREX(TSModelForCausalLM): def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor @@ -345,3 +414,96 @@ def forward( logits = outputs["logits"] past_key_values = outputs["past_key_values"] if self.use_cache else None return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) + +def loading_configure_file(model, json_file_path, example_inputs): + """Recover ipex model from JSON file. + + Args: + model (object): fp32 model need to do quantization. + json_file_path (json): configuration JSON file for ipex. + example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. + + Returns: + (object): quantized model + """ + + ipex = LazyImport("intel_extension_for_pytorch") + from torch.ao.quantization.observer import MinMaxObserver + + if ipex.__version__ >= "2.1.100": + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver) + else: + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver()) + if isinstance(example_inputs, dict): + model = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=example_inputs, inplace=True) + else: + model = ipex.quantization.prepare(model, qconfig, example_inputs=example_inputs, inplace=True) + model.load_qconf_summary(qconf_summary=json_file_path) + model = ipex.quantization.convert(model, inplace=True) + model.eval() + with torch.no_grad(): + model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False, check_trace=False) + model = torch.jit.freeze(model.eval()) + + model(**example_inputs) + model(**example_inputs) + return model + +def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remote_code=False): + """Recover ipex model from JSON file. + + Args: + model (object): fp32 model need to do quantization. + json_file_path (json): configuration JSON file for ipex saved. + trust_remote_code (bool): trust remote code. 
+ + Returns: + (object): quantized model + """ + from transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, trust_remote_code=trust_remote_code) + if model.config.model_type in IPEX_OPT_LLM_SUPPORTED: + qconfig = ipex.quantization.default_static_qconfig_mapping + model = ipex.llm.optimize( + model.eval(), + dtype=torch.float, + inplace=True, + quantization_config=qconfig, + deployment_mode=False, + ) + # config + model.config.torchscript = True + config = model.config + + # example_inputs + + input_ids= model.dummy_inputs["input_ids"] + input_bs, input_len = input_ids.shape + attention_mask = torch.ones_like(input_ids) + position_ids = torch.arange(input_len).repeat(input_bs, 1) + num_beams = 1 + if config.model_type in IPEX_OPT_LLM_SUPPORTED: + past_key_values = generate_dummy_past_key_values_for_opt_llm( + config=config, input_bs=input_bs, num_beams=num_beams + ) + else: + past_key_values = generate_dummy_past_key_values( + config=config, input_bs=input_bs + ) + if config.model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + example_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values + } + else: + example_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values + } + + model = loading_configure_file(model, json_file_path, example_inputs) + model = TSModelCausalLMForITREX(model, config=config) + return model diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 0bb6f59432d..6c77b12eba0 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -16,46 +16,54 @@ # limitations under the License. 
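`recover_model_from_json`, added above, rebuilds the IPEX SmoothQuant model from nothing but the FP32 checkpoint and the saved qconf JSON, re-tracing and freezing it before wrapping it in `TSModelCausalLMForITREX`. A usage sketch, assuming `./saved_results/best_configure.json` was written by an earlier quantization run (the model id and paths are placeholders):

```python
from transformers import AutoTokenizer

from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import (
    recover_model_from_json,
)

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder FP32 checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

q_model = recover_model_from_json(
    model_id,                               # FP32 model re-prepared with IPEX
    "./saved_results/best_configure.json",  # qconf summary saved at quantization time
    trust_remote_code=False,
)

inputs = tokenizer("Once upon a time", return_tensors="pt")
out = q_model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(out[0]))
```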
-import logging import gc +import logging import math import os -from ...utils import CpuInfo + from accelerate import init_empty_weights from datasets import load_dataset from neural_compressor import quantization +from neural_compressor.torch.algorithms.weight_only.autoround import ( + get_autoround_default_run_fn, +) from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear -from neural_compressor.utils.utility import LazyImport -from neural_compressor.torch.algorithms.weight_only.autoround import get_autoround_default_run_fn from neural_compressor.torch.quantization import ( AutoRoundConfig, - RTNConfig, - GPTQConfig, AWQConfig, - TEQConfig, - SmoothQuantConfig, + GPTQConfig, HQQConfig, + RTNConfig, + SmoothQuantConfig, + TEQConfig, convert, prepare, - quantize + quantize, ) +from neural_compressor.utils.utility import LazyImport +from transformers import AutoTokenizer + +from intel_extension_for_transformers.tools.utils import ( + is_autoround_available, + is_ipex_available, +) + +from ...utils import CpuInfo from .sq_utils import ( IPEX_OPT_LLM_SUPPORTED, MODEL_TYPES_REQUIRING_POSITION_IDS, - generate_dummy_past_key_values_for_opt_llm, generate_dummy_past_key_values, - get_dataloader -) -from intel_extension_for_transformers.tools.utils import ( - is_ipex_available, - is_autoround_available, + generate_dummy_past_key_values_for_opt_llm, + get_dataloader, ) -from transformers import AutoTokenizer + if is_ipex_available(): import intel_extension_for_pytorch as ipex if is_autoround_available(): - from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woqlinear # pylint: disable=E0401 + from auto_round.export.export_to_itrex.model_wrapper import ( + WeightOnlyLinear as auto_round_woqlinear, + ) # pylint: disable=E0401 torch = LazyImport("torch") @@ -171,14 +179,15 @@ def _replace_linear( current_key_name = [] current_key_name.append(name) is_removed = False - use_optimum_format = getattr(module, "use_optimum_format", False) or \ - quantization_config.weight_dtype not in [ - "fp8_e5m2", - "fp8_e4m3", - "fp4", - "nf4", - "int4_fullrange", - ] + use_optimum_format = getattr( + module, "use_optimum_format", False + ) or quantization_config.weight_dtype not in [ + "fp8_e5m2", + "fp8_e4m3", + "fp4", + "nf4", + "int4_fullrange", + ] if ( isinstance(module, torch.nn.Linear) @@ -202,62 +211,75 @@ def _replace_linear( or device == "auto" ): if is_ipex_available() and quantization_config.use_ipex: - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear - from intel_extension_for_pytorch.utils.weight_only_quantization import \ - _convert_optimum_format_to_desired + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_linear, + ) + from intel_extension_for_pytorch.utils.weight_only_quantization import ( + _convert_optimum_format_to_desired, + ) - qweight, scales, qzeros = _convert_optimum_format_to_desired(module.qweight, - module.scales, - module.qzeros) + qweight, scales, qzeros = ( + _convert_optimum_format_to_desired( + module.qweight, module.scales, module.qzeros + ) + ) weight_dtype = { 4: ipex.quantization.WoqWeightDtype.INT4, 8: ipex.quantization.WoqWeightDtype.INT8, } compute_dtype = { - "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. + "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. 
"bf16": ipex.quantization.WoqLowpMode.BF16, "fp16": ipex.quantization.WoqLowpMode.FP16, "int8": ipex.quantization.WoqLowpMode.INT8, - } - ipex_qconfig_mapping = ( - ipex.quantization.get_weight_only_quant_qconfig_mapping( - weight_dtype=weight_dtype[quantization_config.bits], - lowp_mode=compute_dtype[quantization_config.compute_dtype], - act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, - group_size=quantization_config.group_size, - ) + ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype[quantization_config.bits], + lowp_mode=compute_dtype[ + quantization_config.compute_dtype + ], + act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + group_size=quantization_config.group_size, ) tmp_linear = torch.nn.Linear( in_features, out_features, - True if hasattr(module, "bias") else False - ) + True if hasattr(module, "bias") else False, + ) tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig - model._modules[name] = ipex_linear.from_float_and_int4_weight( - mod = tmp_linear, - qweight = qweight, - scales = scales, - zero_points = qzeros, - bias = module.bias if hasattr(module, "bias") else None, - group_size = quantization_config.group_size, - g_idx = module.g_idx if hasattr(module, "g_idx") else None, + model._modules[name] = ( + ipex_linear.from_float_and_int4_weight( + mod=tmp_linear, + qweight=qweight, + scales=scales, + zero_points=qzeros, + bias=( + module.bias if hasattr(module, "bias") else None + ), + group_size=quantization_config.group_size, + g_idx=( + module.g_idx + if hasattr(module, "g_idx") + else None + ), + ) ) else: from .nn.modules import ( QuantizedLinearQBits, ) # TODO: QuantizedLinearINT4, QuantizedLinearINT8 - use_optimum_format = getattr(module, "use_optimum_format", False) or \ - quantization_config.weight_dtype not in [ - "fp8_e5m2", - "fp8_e4m3", - "fp4", - "nf4", - "int4_fullrange", - ] + use_optimum_format = getattr( + module, "use_optimum_format", False + ) or quantization_config.weight_dtype not in [ + "fp8_e5m2", + "fp8_e4m3", + "fp4", + "nf4", + "int4_fullrange", + ] model._modules[name] = QuantizedLinearQBits( in_features, @@ -269,14 +291,18 @@ def _replace_linear( scale_dtype=quantization_config.scale_dtype, blocksize=quantization_config.group_size, scheme=quantization_config.scheme, - compression_dtype=getattr(module, "compression_dtype", torch.int32), + compression_dtype=getattr( + module, "compression_dtype", torch.int32 + ), compression_dim=getattr(module, "compression_dim", 1), device=device, use_optimum_format=use_optimum_format, ) elif device == "xpu" or device == torch.device("xpu"): - from intel_extension_for_pytorch.nn.utils._quantize_convert \ - import WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 + from intel_extension_for_pytorch.nn.utils._quantize_convert import ( + WeightOnlyQuantizedLinear as ipex_linear, + ) # pylint: disable=E0401 + model._modules[name] = ipex_linear( in_features, out_features, @@ -287,13 +313,21 @@ def _replace_linear( scale_dtype=quantization_config.scale_dtype, blocksize=quantization_config.group_size, scheme=quantization_config.scheme, - compression_dtype=getattr(module, "compression_dtype", torch.int8), + compression_dtype=getattr( + module, "compression_dtype", torch.int8 + ), compression_dim=getattr(module, "compression_dim", 0), device=device, - use_optimum_format=getattr(module, "use_optimum_format", False), + use_optimum_format=getattr( + module, "use_optimum_format", False + ), ) if 
quantization_config.quant_method.value == "gptq": - g_idx = getattr(module, "g_idx", torch.zeros(in_features, dtype=torch.int32).to(device)) + g_idx = getattr( + module, + "g_idx", + torch.zeros(in_features, dtype=torch.int32).to(device), + ) else: g_idx = None model._modules[name].set_scales_zps_gidx( @@ -330,7 +364,9 @@ def _replace_linear( model._modules[name].requires_grad_(False) if quantization_config.use_ipex: pass - elif (device == "cpu" or device == torch.device("cpu") or device == "auto"): + elif ( + device == "cpu" or device == torch.device("cpu") or device == "auto" + ): if quantization_config.weight_dtype in [ "fp8_e5m2", "fp8_e4m3", @@ -391,17 +427,16 @@ def _replace_linear( return model, is_replaced -def default_run_fn(model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="rtn"): - from datasets import load_dataset +def default_run_fn( + model, tokenizer, dataset, max_length=512, n_samples=100, batch_size=8, algo="rtn" +): from torch.utils.data import DataLoader if isinstance(dataset, (str, bytes, os.PathLike)): calib_dataset = load_dataset(dataset, split="train") calib_dataset = calib_dataset.shuffle(seed=42) if tokenizer is None: - logger.error( - "Please provide the tokenizer in quantization_config." - ) + logger.error("Please provide the tokenizer in quantization_config.") exit(0) def tokenize_function(examples): @@ -410,17 +445,23 @@ def tokenize_function(examples): tokenizer.pad_token = tokenizer.eos_token if "prompt" in examples: if algo == "teq": - example = tokenizer(examples["prompt"], padding="max_length", max_length=max_length) + example = tokenizer( + examples["prompt"], padding="max_length", max_length=max_length + ) else: example = tokenizer(examples["prompt"]) elif "code" in examples: if algo == "teq": - example = tokenizer(examples["code"], padding="max_length", max_length=max_length) + example = tokenizer( + examples["code"], padding="max_length", max_length=max_length + ) else: example = tokenizer(examples["code"]) elif "text" in examples: if algo == "teq": - example = tokenizer(examples["text"], padding="max_length", max_length=max_length) + example = tokenizer( + examples["text"], padding="max_length", max_length=max_length + ) else: example = tokenizer(examples["text"]) else: @@ -467,6 +508,7 @@ def collate_batch(batch): except ValueError: pass + def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): import intel_extension_for_pytorch @@ -495,42 +537,38 @@ def convert_to_quantized_model(model, config, device="cpu"): # mapping to INC config if config.quant_method.value == "rtn": export_compressed_model = False - if (device == "cpu" or device == torch.device("cpu")) \ - and config.weight_dtype not in ["nf4", "fp4", "int4_fullrange"]: + if ( + device == "cpu" or device == torch.device("cpu") + ) and config.weight_dtype not in ["nf4", "fp4", "int4_fullrange"]: export_compressed_model = True quant_config = RTNConfig( - dtype=config.weight_dtype, - bits=config.bits, - use_sym=config.sym, - group_size=config.group_size, - group_dim=config.group_dim, - use_full_range=config.use_full_range, - use_mse_search=config.mse_range, - export_compressed_model=export_compressed_model, - use_layer_wise=config.layer_wise, - model_path=config.model_path, - use_double_quant=config.use_double_quant, - double_quant_dtype=config.double_quant_dtype, - double_quant_bits=config.double_quant_bits, - double_quant_use_sym=config.double_quant_use_sym, - 
double_quant_group_size=config.double_quant_group_size, + dtype=config.weight_dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + group_dim=config.group_dim, + use_full_range=config.use_full_range, + use_mse_search=config.mse_range, + export_compressed_model=export_compressed_model, + use_layer_wise=config.layer_wise, + model_path=config.model_path, + use_double_quant=config.use_double_quant, + double_quant_dtype=config.double_quant_dtype, + double_quant_bits=config.double_quant_bits, + double_quant_use_sym=config.double_quant_use_sym, + double_quant_group_size=config.double_quant_group_size, ) model = prepare(model, quant_config) model = convert(model) elif config.quant_method.value == "hqq": - quant_config = HQQConfig( - - ) + quant_config = HQQConfig() elif config.quant_method.value == "awq": - quant_config = AWQConfig( - - ) + quant_config = AWQConfig() elif config.quant_method.value == "teq": quant_config = TEQConfig( dtype=dtype, bits=config.bits, use_sym=config.sym, - ) elif config.quant_method.value == "gptq": quant_config = GPTQConfig( @@ -552,10 +590,10 @@ def convert_to_quantized_model(model, config, device="cpu"): run_args = ( config.tokenizer, config.dataset, - config.max_input_length, # max_length - config.nsamples, # n_samples - config.batch_size, # batch_size - config.quant_method.value # algo + config.max_input_length, # max_length + config.nsamples, # n_samples + config.batch_size, # batch_size + config.quant_method.value, # algo ) model = prepare(model=model, quant_config=quant_config) run_fn(model, *run_args) @@ -585,7 +623,7 @@ def convert_to_quantized_model(model, config, device="cpu"): quant_config.seqlen, quant_config.seed, quant_config.batch_size, - "train" + "train", ) model = prepare(model=model, quant_config=quant_config) run_fn(model, *run_args) @@ -652,6 +690,7 @@ def get_bits(config): ) return bits + def convert_to_smoothquant_model(model, quantization_config): model_type = model.config.model_type.replace("_", "-") # ipex.optimize_transformers @@ -685,9 +724,18 @@ def convert_to_smoothquant_model(model, quantization_config): ) # get calibration dataloader if quantization_config.alpha == "auto" and model_type == "llama": - calib_dataloader = get_dataloader(model_type, quantization_config, past_key_values=past_key_values, shuffle=True, padding=True, max_input_lenth=2048, pad_val=1) + calib_dataloader = get_dataloader( + model_type, + quantization_config, + past_key_values=past_key_values, + shuffle=True, + padding=True, + seq_len=quantization_config.seq_len, + ) else: - calib_dataloader = get_dataloader(model_type, quantization_config, past_key_values=past_key_values) + calib_dataloader = get_dataloader( + model_type, quantization_config, past_key_values=past_key_values + ) def calib_func(model): with torch.no_grad(): @@ -732,15 +780,17 @@ def calib_func(model): alpha_step=quantization_config.alpha_step, shared_criterion=quantization_config.shared_criterion, do_blockwise=quantization_config.do_blockwise, - ) # fallback if model_type in ["gptj", "gpt_neox", "mpt"]: - quant_config = quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) - q_model = quantize(model, quant_config=quant_config, run_fn=calib_func, example_inputs=example_inputs) - with torch.no_grad(): - q_model = torch.jit.trace(q_model.eval(), example_kwarg_inputs=example_inputs, strict=False, check_trace=False) - q_model = torch.jit.freeze(q_model.eval()) - q_model(**example_inputs) - q_model(**example_inputs) - return q_model + 
quant_config = quant_config.set_local( + torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32") + ) + model = quantize( + model, + quant_config=quant_config, + run_fn=calib_func, + example_inputs=example_inputs, + ) + + return model diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 65c478f7000..75e14004c66 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -21,10 +21,12 @@ import os from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple, Union -from .utility import QUANT_CONFIG, SPARSITY_CONFIG, LazyImport, logger + import transformers from transformers import BitsAndBytesConfig, PretrainedConfig +from .utility import QUANT_CONFIG, SPARSITY_CONFIG, LazyImport, logger + torch = LazyImport("torch") @@ -32,8 +34,10 @@ class MixedPrecisionConfig: dtype: str = "bfloat16" + if transformers.__version__ >= "4.32.0": from transformers.utils.quantization_config import QuantizationConfigMixin + QuantizationConfig = QuantizationConfigMixin else: QuantizationConfig = PretrainedConfig @@ -54,7 +58,6 @@ class QuantizationMethod(str, Enum): QuantAwareTraining = "qat" - class SparsityConfig(PretrainedConfig): def __init__( self, @@ -237,6 +240,7 @@ def get_config_dict( pretrained_model_name_or_path, _configuration_file=SPARSITY_CONFIG, **kwargs ) + class ITREXQuantizationConfigMixin(QuantizationConfig): """Mixin class for quantization config.""" @@ -258,7 +262,9 @@ def update(self, **kwargs): to_remove.append(key) # Remove all the attributes that were updated, without modifying the input dict - unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove} + unused_kwargs = { + key: value for key, value in kwargs.items() if key not in to_remove + } return unused_kwargs def post_init_cpu(self): @@ -307,18 +313,18 @@ def post_init_cpu(self): "fp8_e4m3", ]: raise ValueError( - f"weight_dtype must be a string in " - f"'int8', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'" + "weight_dtype must be a string in " + "'int8', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'" ) if self.scale_dtype is not None and self.scale_dtype not in [ "fp32", "fp8_e8m0", - "bf16" + "bf16", ]: raise ValueError( - f"scale_dtype must be a string in 'fp32', 'fp8_e8m0', 'bf16' " - f"and fp8_e8m0 only used for weight_dtype 'fp8_e5m2', 'fp8_e4m3'" + "scale_dtype must be a string in 'fp32', 'fp8_e8m0', 'bf16' " + "and fp8_e8m0 only used for weight_dtype 'fp8_e5m2', 'fp8_e4m3'" ) elif self.scale_dtype is None: self.scale_dtype = "fp32" @@ -345,9 +351,9 @@ def post_init_cpu(self): or self.scale_dtype != "fp32" ): raise ValueError( - f"WeightOnlyQuantization doesn't support asym with " - f"compute_dtype int8 or weight_dtype float or scale_dtype non-fp32 now, " - f"please use sym scheme" + "WeightOnlyQuantization doesn't support asym with " + "compute_dtype int8 or weight_dtype float or scale_dtype non-fp32 now, " + "please use sym scheme" ) self.use_neural_speed = False @@ -375,10 +381,12 @@ def post_init_xpu(self): elif self.weight_dtype not in [ "int4_fullrange", ]: - raise ValueError(f"weight_dtype must be a string in 'int4_fullrange', but get {self.weight_dtype}.") + raise ValueError( + f"weight_dtype must be a string in 'int4_fullrange', but get {self.weight_dtype}." 
+ ) if self.scale_dtype is not None and self.scale_dtype not in ["fp16"]: - raise ValueError(f"scale_dtype must be a string in 'fp16'") + raise ValueError("scale_dtype must be a string in 'fp16'") elif self.scale_dtype is None: self.scale_dtype = "fp16" @@ -411,7 +419,7 @@ def post_init_runtime(self): runtime_supported_weight_dtype = [ "int4", "int4_clip", # int4_clip will merge to int4 in next release. - "int4_fullrange", # int4_fullrange will merge to int4 in next release. + "int4_fullrange", # int4_fullrange will merge to int4 in next release. "int8", "fp8", "fp8_e5m2", @@ -535,13 +543,48 @@ def to_json_file( writer.write(self.to_json_string(use_diff=use_diff)) def remove_redundant_parameters(self): - remove_parameters = ["calib_dataloader", "dataset", "calib_func", "calib_iters", "calib_len", - "double_quant_scale_dtype", "use_double_quant", "mse_range", "scheme", "tokenizer", "use_ggml", - "use_neural_speed", "use_quant", "layer_wise", "blocksize", "nsamples", "max_input_length", "static_groups", - "lr", "minmax_lr", "iters", "use_quant_input", "device", "calib_dataset", "calib_pad_val", "calib_shuffle", - "calib_padding", "example_inputs", "excluded_precisions", "op_name_dict", "op_type_dict", "train_dataloader", - "train_func", "train_iters", "train_len", "train_padding", "train_dataset", "train_pad_val", "train_shuffle", - "train_batch_size"] + remove_parameters = [ + "calib_dataloader", + "dataset", + "calib_func", + "calib_iters", + "calib_len", + "double_quant_scale_dtype", + "use_double_quant", + "mse_range", + "scheme", + "tokenizer", + "use_ggml", + "use_neural_speed", + "use_quant", + "layer_wise", + "blocksize", + "nsamples", + "max_input_length", + "static_groups", + "lr", + "minmax_lr", + "iters", + "use_quant_input", + "device", + "calib_dataset", + "calib_pad_val", + "calib_shuffle", + "calib_padding", + "example_inputs", + "excluded_precisions", + "op_name_dict", + "op_type_dict", + "train_dataloader", + "train_func", + "train_iters", + "train_len", + "train_padding", + "train_dataset", + "train_pad_val", + "train_shuffle", + "train_batch_size", + ] for parameter in remove_parameters: if hasattr(self, parameter): delattr(self, parameter) @@ -604,24 +647,25 @@ def get_config_dict( pretrained_model_name_or_path, _configuration_file=cf, **kwargs ) + class QuantAwareTrainingConfig(ITREXQuantizationConfigMixin): def __init__( - self, - backend="default", - tokenizer=None, - train_dataset="NeelNanda/pile-10k", - train_dataloader=None, - train_func=None, - train_shuffle=True, - train_iters=100, - train_padding=True, - train_batch_size=8, - train_len=512, - train_pad_val=1, - op_name_dict=None, - op_type_dict=None, - excluded_precisions=[], - **kwargs, + self, + backend="default", + tokenizer=None, + train_dataset="NeelNanda/pile-10k", + train_dataloader=None, + train_func=None, + train_shuffle=True, + train_iters=100, + train_padding=True, + train_batch_size=8, + train_len=512, + train_pad_val=1, + op_name_dict=None, + op_type_dict=None, + excluded_precisions=[], + **kwargs, ): self.quant_method = QuantizationMethod.QuantAwareTraining self.backend = backend @@ -642,35 +686,36 @@ def __init__( class DynamicQuantConfig(ITREXQuantizationConfigMixin): def __init__( - self, - excluded_precisions=[], - op_name_dict=None, - op_type_dict=None, - **kwargs, + self, + excluded_precisions=[], + op_name_dict=None, + op_type_dict=None, + **kwargs, ): self.quant_method = QuantizationMethod.DYNAMIC self.excluded_precisions = excluded_precisions self.op_name_dict = op_name_dict 
self.op_type_dict = op_type_dict + class StaticQuantConfig(ITREXQuantizationConfigMixin): def __init__( - self, - backend="default", - tokenizer=None, - calib_dataset="NeelNanda/pile-10k", - calib_dataloader=None, - calib_func=None, - calib_shuffle=True, - calib_iters=100, - calib_padding=False, - calib_len=512, - calib_pad_val=1, - op_name_dict=None, - op_type_dict=None, - excluded_precisions=[], - example_inputs=None, - **kwargs, + self, + backend="default", + tokenizer=None, + calib_dataset="NeelNanda/pile-10k", + calib_dataloader=None, + calib_func=None, + calib_shuffle=True, + calib_iters=100, + calib_padding=False, + calib_len=512, + calib_pad_val=1, + op_name_dict=None, + op_type_dict=None, + excluded_precisions=[], + example_inputs=None, + **kwargs, ): self.quant_method = QuantizationMethod.STATIC self.backend = backend @@ -688,30 +733,31 @@ def __init__( self.excluded_precisions = excluded_precisions self.example_inputs = example_inputs + class SmoothQuantConfig(ITREXQuantizationConfigMixin): def __init__( - self, - tokenizer=None, - dataset="NeelNanda/pile-10k", - alpha=0.5, - scale_sharing = False, - init_alpha = 0.5, - alpha_min = 0.0, - alpha_max = 1.0, - alpha_step = 0.1, - shared_criterion = "max", - do_blockwise = False, - auto_alpha_args = None, - n_samples=100, - excluded_precisions=[], - ipex_opt_llm=None, - num_beams=1, - **kwargs, - + self, + tokenizer=None, + dataset="NeelNanda/pile-10k", + alpha=0.5, + scale_sharing=False, + init_alpha=0.5, + alpha_min=0.0, + alpha_max=1.0, + alpha_step=0.1, + shared_criterion="max", + do_blockwise=False, + auto_alpha_args=None, + n_samples=100, + seq_len=512, + excluded_precisions=[], + ipex_opt_llm=None, + num_beams=1, + **kwargs, ): self.quant_method = QuantizationMethod.SmoothQuant self.dataset = dataset - self.tokenizer=tokenizer + self.tokenizer = tokenizer self.alpha = alpha self.scale_sharing = scale_sharing self.init_alpha = init_alpha @@ -721,11 +767,13 @@ def __init__( self.shared_criterion = shared_criterion self.do_blockwise = do_blockwise self.auto_alpha_args = auto_alpha_args - self.n_samples=n_samples + self.n_samples = n_samples + self.seq_len = seq_len self.ipex_opt_llm = ipex_opt_llm self.num_beams = num_beams self.excluded_precisions = excluded_precisions + class RtnConfig(ITREXQuantizationConfigMixin): def __init__( self, @@ -803,6 +851,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict + class GPTQConfig(ITREXQuantizationConfigMixin): def __init__( self, @@ -917,6 +966,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict + class AwqConfig(ITREXQuantizationConfigMixin): def __init__( self, @@ -984,6 +1034,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict + class TeqConfig(ITREXQuantizationConfigMixin): def __init__( self, @@ -1046,6 +1097,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict + class AutoRoundConfig(ITREXQuantizationConfigMixin): def __init__( self, From 4dc368ae1bb8173b9db6ec7f375173100ef9c560 Mon Sep 17 00:00:00 2001 From: "Ye, Xinyu" Date: Fri, 24 May 2024 01:47:30 -0400 Subject: [PATCH 07/28] added HQQ for WOQ in ITREX. 
Signed-off-by: Ye, Xinyu --- .../quantization/run_generation_cpu_woq.py | 9 +++- .../transformers/__init__.py | 1 + .../transformers/llm/quantization/utils.py | 15 +++--- .../transformers/modeling/modeling_auto.py | 9 ++-- .../transformers/utils/__init__.py | 1 + .../transformers/utils/config.py | 52 +++++++++++++++++++ 6 files changed, 76 insertions(+), 11 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index a373f36e848..9a455c2fb5e 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -12,6 +12,7 @@ from intel_extension_for_transformers.transformers import ( BitsAndBytesConfig, RtnConfig, + HQQConfig, AwqConfig, TeqConfig, GPTQConfig, @@ -47,7 +48,7 @@ parser.add_argument( "--woq_algo", default="Rtn", - choices=["Rtn", "Awq", "Teq", "GPTQ", "AutoRound"], + choices=["Rtn", "Awq", "Teq", "GPTQ", "AutoRound", "HQQ"], help="Weight-only algorithm.", ) parser.add_argument( @@ -217,6 +218,12 @@ layer_wise=args.layer_wise, use_ipex=args.use_ipex, ) + elif args.woq_algo == "HQQ": + quantization_config = HQQConfig( + bits=args.bits, + group_size=args.group_size, + use_ipex=args.use_ipex, + ) elif args.woq_algo == "Awq": quantization_config = AwqConfig( tokenizer=tokenizer, diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 54e9e3f60fb..014335155b4 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -48,6 +48,7 @@ DynamicQuantConfig, QuantAwareTrainingConfig, RtnConfig, + HQQConfig, AwqConfig, TeqConfig, GPTQConfig, diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 2bc0425e65b..1e7668932ba 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -490,10 +490,6 @@ def convert_to_quantized_model(model, config, device="cpu"): dtype = config.weight_dtype # mapping to INC config if config.quant_method.value == "rtn": - export_compressed_model = False - if (device == "cpu" or device == torch.device("cpu")) \ - and config.weight_dtype not in ["nf4", "fp4", "int4_fullrange"]: - export_compressed_model = True quant_config = RTNConfig( dtype=config.weight_dtype, bits=config.bits, @@ -502,7 +498,6 @@ def convert_to_quantized_model(model, config, device="cpu"): group_dim=config.group_dim, use_full_range=config.use_full_range, use_mse_search=config.mse_range, - export_compressed_model=export_compressed_model, use_layer_wise=config.layer_wise, model_path=config.model_path, use_double_quant=config.use_double_quant, @@ -515,8 +510,16 @@ def convert_to_quantized_model(model, config, device="cpu"): model = convert(model) elif config.quant_method.value == "hqq": quant_config = HQQConfig( - + bits=config.bits, + group_size=config.group_size, + quant_zero=config.quant_zero, + quant_scale=config.quant_scale, + scale_quant_group_size=config.scale_quant_group_size, + skip_lm_head=config.skip_lm_head, ) + model = prepare(model, quant_config) + model = convert(model) + return model elif config.quant_method.value == "awq": quant_config = AWQConfig( diff 
--git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index cca3b40abcf..1e8b45335f9 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -46,6 +46,7 @@ DynamicQuantConfig, QuantAwareTrainingConfig, RtnConfig, + HQQConfig, AwqConfig, TeqConfig, GPTQConfig, @@ -618,7 +619,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): logger.info("Mixed Precision done.") elif isinstance( quantization_config, - (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), + (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig, HQQConfig), ): logger.info("Applying Weight Only Quantization.") if use_neural_speed: @@ -1000,9 +1001,9 @@ def calib_func(model): white_list=quantizate_config.white_list, ) - model = quantize(model, - quant_config=quant_config, - run_fn=run_fn, + model = quantize(model, + quant_config=quant_config, + run_fn=run_fn, example_inputs=example_inputs ) logger.info("SmoothQuant done.") diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index 4eaba5a00fe..7d810beca21 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -25,6 +25,7 @@ QuantAwareTrainingConfig, SparsityConfig, RtnConfig, + HQQConfig, AwqConfig, TeqConfig, GPTQConfig, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 04aa729f207..6354b92aa79 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -46,6 +46,7 @@ class QuantizationMethod(str, Enum): AWQ = "awq" AQLM = "aqlm" RTN = "rtn" + HQQ = "hqq" AUTOROUND = "autoround" TEQ = "teq" DYNAMIC = "dynamic" @@ -810,6 +811,57 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict +class HQQConfig(ITREXQuantizationConfigMixin): + def __init__( + self, + bits: int = 4, + group_size: int = 64, + quant_zero: bool = True, + quant_scale: bool = False, + scale_quant_group_size: int = 128, + skip_lm_head: bool = True, + **kwargs, + ): + self.quant_method = QuantizationMethod.HQQ + self.bits = bits + self.weight_dtype = None + self.compute_dtype = None + self.scale_dtype = None + self.use_double_quant = False + self.scheme = "" + self.group_size = group_size + self.quant_zero = quant_zero + self.quant_scale = quant_scale + self.scale_quant_group_size = scale_quant_group_size + self.skip_lm_head = skip_lm_head + self.device = kwargs.get("device", "auto") + self.calib_dataloader = None + self.dataset = None + self.calib_func = None + self.calib_iters = None + self.use_ipex = kwargs.pop("use_ipex", False) + + def to_diff_dict(self) -> Dict[str, Any]: + """Removes all attributes from config which correspond to the default config attributes + for better readability and serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = RtnConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + class GPTQConfig(ITREXQuantizationConfigMixin): def __init__( self, From a7cf2c1a42fd46221d6b3a95720d639770c64757 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 27 May 2024 20:27:08 -0700 Subject: [PATCH 08/28] support chatglm,qwen,baichuan sq Signed-off-by: changwangss --- .../transformers/llm/quantization/sq_utils.py | 3 +++ .../transformers/modeling/modeling_auto.py | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index b914f89425c..fc8f4d12cdf 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -99,6 +99,7 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): config.num_attention_heads, config.hidden_size // config.num_attention_heads, ] + num_layers = config.num_layers elif config.model_type == "baichuan": new_shape = [ input_bs, @@ -106,6 +107,7 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): 1, config.hidden_size // config.num_attention_heads, ] + num_layers = config.num_layers elif config.model_type == "chatglm": new_shape = [ 1, @@ -113,6 +115,7 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): config.num_attention_heads, config.hidden_size // config.num_attention_heads, ] + num_layers = config.num_layers else: normalized_config = NormalizedConfigManager.get_normalized_config_class( config.model_type diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index de05e67f3f8..dfd2030852e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -1490,11 +1490,12 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( TSModelCausalLMForITREX, ) - q_model = TSModelCausalLMForITREX.from_pretrained( - pretrained_model_name_or_path, - file_name=WEIGHTS_NAME, - trust_remote_code=trust_remote_code, - ) + q_model = torch.jit.load(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)) + origin_model_type = config.model_type + if origin_model_type in ["chatglm", "qwen", "baichuan"]: + config.model_type = "qwen2" + q_model = TSModelCausalLMForITREX(q_model, config=config) + q_model.config.model_type = origin_model_type return q_model dtype_orig = None if torch_dtype is not None: From 2a60dfbe4eda0e96c765332d274b4bef60c86427 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 29 May 2024 00:05:37 -0700 Subject: [PATCH 09/28] adapt inc fixed gptq Signed-off-by: changwangss --- .../quantization/run_generation_cpu_woq.py | 10 +++++----- .../transformers/llm/quantization/nn/modules.py | 7 ------- 
.../transformers/llm/quantization/sq_utils.py | 8 ++++++-- .../transformers/llm/quantization/utils.py | 11 +++++++---- .../transformers/utils/config.py | 8 ++++---- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index ac45ea78b2d..6140ea9d6c8 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -116,10 +116,10 @@ help="Block size. sub weight matrix size to run GPTQ.", ) parser.add_argument( - "--nsamples", type=int, default=512, help="Number of calibration data samples." + "--n_samples", type=int, default=512, help="Number of calibration data samples." ) parser.add_argument( - "--max_input_length", + "--seq_len", type=int, default=2048, help="Calibration dataset sequence max length, this should align with your model config", @@ -266,10 +266,10 @@ damp_percent=args.damp_percent, sym=True if args.scheme == "sym" else False, blocksize=args.blocksize, - nsamples=args.nsamples, + n_samples=args.n_samples, static_groups=args.static_groups, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, @@ -284,7 +284,7 @@ dataset=args.dataset, bits=args.bits, sym=True if args.scheme == "sym" else False, - nsamples=args.nsamples, + n_samples=args.n_samples, group_size=args.group_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, diff --git a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py index abebd077010..664118379c4 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py @@ -194,8 +194,6 @@ def set_weights_bias( if q_config.quant_method.value == "gptq": if q_config.desc_act: - print("before qbits g_idx") - print(g_idx) if not q_config.static_groups: int_weight2 = int_weight.clone() group_size = q_config.group_size @@ -299,7 +297,6 @@ def recover_idx(ret_idx, k, blocksize): for i in range(value_range): for j in range(blocksize): g_idx[ret_idx[i * blocksize + j]] = i - print(ret_idx[i * blocksize + j]) return g_idx def recover_int_weight(g_idx, int_weight): @@ -327,11 +324,7 @@ def recover_int_weight(g_idx, int_weight): desc_act = qbits.acquire_packed_weight_info(self.weight, 4)[0] != 0 if desc_act: g_idx = qbits.acquire_packed_weight_info(self.weight, 5) - print("qbits recover g_idx") - print(g_idx) g_idx = recover_idx(g_idx, in_features, group_size) - print("postprocess recover g_idx") - print(g_idx) else: g_idx = None weight_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 6) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index fc8f4d12cdf..1ffb7b47001 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -41,6 +41,7 @@ "falcon", "chatglm", "baichuan", + "qwen", "bloom", "codegen", "gptbigcode", @@ -99,7 +100,7 @@ def generate_dummy_past_key_values_for_opt_llm(config, 
input_bs, num_beams=1): config.num_attention_heads, config.hidden_size // config.num_attention_heads, ] - num_layers = config.num_layers + num_layers = config.num_hidden_layers elif config.model_type == "baichuan": new_shape = [ input_bs, @@ -107,7 +108,7 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): 1, config.hidden_size // config.num_attention_heads, ] - num_layers = config.num_layers + num_layers = config.num_hidden_layers elif config.model_type == "chatglm": new_shape = [ 1, @@ -165,6 +166,7 @@ def generate_dummy_past_key_values(config, input_bs): config.num_attention_heads, config.hidden_size // config.num_attention_heads, ] + num_layers = config.num_hidden_layers elif config.model_type == "baichuan": new_shape = [ input_bs, @@ -172,6 +174,7 @@ def generate_dummy_past_key_values(config, input_bs): 1, config.hidden_size // config.num_attention_heads, ] + num_layers = config.num_hidden_layers elif config.model_type == "chatglm": new_shape = [ 1, @@ -179,6 +182,7 @@ def generate_dummy_past_key_values(config, input_bs): config.num_attention_heads, config.hidden_size // config.num_attention_heads, ] + num_layers = config.num_layers else: normalized_config = NormalizedConfigManager.get_normalized_config_class( config.model_type diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 0714ea4251b..a190276678e 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -474,6 +474,7 @@ def tokenize_function(examples): tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + tokenized_dataset = tokenized_dataset.filter(lambda x: x['input_ids'].shape[-1] >= max_length) def collate_batch(batch): input_ids_padded = [] @@ -481,12 +482,14 @@ def collate_batch(batch): input_ids = text["input_ids"] if len(input_ids) >= max_length: input_ids = input_ids[:max_length] + input_ids_padded.append(input_ids) else: continue - input_ids_padded.append(input_ids) - + assert input_ids_padded != [], \ + "The dataset does not have data that meets the required input length. Please reduce seq_len." 
return torch.vstack(input_ids_padded) + calib_dataloader = DataLoader( tokenized_dataset, batch_size=batch_size, @@ -594,8 +597,8 @@ def convert_to_quantized_model(model, config, device="cpu"): run_args = ( config.tokenizer, config.dataset, - config.max_input_length, # max_length - config.nsamples, # n_samples + config.seq_len, # max_length + config.n_samples, # n_samples config.batch_size, # batch_size config.quant_method.value, # algo ) diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 0b533cb20c2..dca0b4e5bd7 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -920,8 +920,8 @@ def __init__( blocksize: int = 128, damp_percent: float = 0.1, desc_act: bool = False, - nsamples: int = 128, - max_input_length: Optional[int] = None, + n_samples: int = 128, + seq_len: Optional[int] = 2048, static_groups: bool = False, true_sequential: bool = True, layer_wise: bool = False, @@ -948,14 +948,14 @@ def __init__( self.use_double_quant = use_double_quant self.double_quant_scale_dtype = double_quant_scale_dtype self.blocksize = blocksize - self.nsamples = nsamples + self.n_samples = n_samples self.group_size = group_size self.damp_percent = damp_percent self.desc_act = desc_act self.static_groups = static_groups self.true_sequential = true_sequential self.layer_wise = layer_wise - self.max_input_length = max_input_length + self.seq_len = seq_len self.llm_int8_skip_modules = ( llm_int8_skip_modules if llm_int8_skip_modules else [] ) From eb63429c9c3b1b0f281a28871c2616c9ad8d9167 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 29 May 2024 03:38:08 -0700 Subject: [PATCH 10/28] migrate awq teq Signed-off-by: changwangss --- .../quantization/llm_quantization_recipes.md | 68 ++++++++-------- .../quantization/run_generation_cpu_woq.py | 81 +++++++++---------- .../transformers/llm/quantization/utils.py | 55 +++++++++++-- .../transformers/utils/config.py | 34 +++++--- 4 files changed, 145 insertions(+), 93 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md index 86e98ff8e57..bbbe2ac3352 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md +++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md @@ -82,7 +82,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -96,7 +96,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -136,7 +136,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 128 \ --accuracy @@ -149,7 +149,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -190,7 +190,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -203,7
+203,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -249,7 +249,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -262,7 +262,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -308,7 +308,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -321,7 +321,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -361,7 +361,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -374,7 +374,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -415,7 +415,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -428,7 +428,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -467,7 +467,7 @@ python run_generation_cpu_woq.py \ --woq_algo GPTQ \ --bits 4 \ --weight_dtype int4_clip \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -480,7 +480,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -521,7 +521,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -534,7 +534,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -575,7 +575,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -588,7 +588,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -629,7 +629,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -642,7 +642,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -682,7 +682,7 @@ python run_generation_cpu_woq.py \ --woq_algo GPTQ \ --bits 4 \ 
--weight_dtype int4_clip \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme asym \ --group_size 32 \ --accuracy @@ -695,7 +695,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -735,7 +735,7 @@ python run_generation_cpu_woq.py \ --woq_algo GPTQ \ --bits 4 \ --weight_dtype int4_clip \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --nsamples 256 \ @@ -749,7 +749,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -789,7 +789,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -802,7 +802,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -841,7 +841,7 @@ python run_generation_cpu_woq.py \ --woq_algo GPTQ \ --bits 4 \ --weight_dtype int4_clip \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme asym \ --group_size 32 \ --accuracy @@ -854,7 +854,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -895,7 +895,7 @@ python run_generation_cpu_woq.py \ --bits 4 \ --weight_dtype int4_clip \ --desc_act \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 32 \ --accuracy @@ -908,7 +908,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy @@ -947,7 +947,7 @@ python run_generation_cpu_woq.py \ --woq_algo GPTQ \ --bits 4 \ --weight_dtype int4_clip \ - --max_input_length 2048 \ + --seq_len 2048 \ --scheme sym \ --group_size 128 \ --accuracy @@ -960,7 +960,7 @@ python run_generation_cpu_woq.py \ --woq_algo AutoRound \ --bits 4 \ --weight_dtype int4_clip \ - --calib_iters 200 \ + --autoround_iters 200 \ --scheme asym \ --group_size 128 \ --accuracy diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index 6140ea9d6c8..91a47600464 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -32,11 +32,12 @@ parser.add_argument("--use_ipex", action="store_true") # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") -parser.add_argument("--iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_iters", default=100, type=int, help="num iters for benchmark") +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="batch size for benchmark") parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") # ============Accuracy configs============== parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--batch_size", default=56, type=int, help="batch size num.") +parser.add_argument("--eval_batch_size", default=56, 
type=int, help="batch size num for evaluation.") parser.add_argument( "--tasks", default="lambada_openai", @@ -91,7 +92,21 @@ action="store_true", help="Use layer wise to do quantization", ) -parser.add_argument("--woq_loading", action="store_true") +parser.add_argument( + "--n_samples", type=int, default=512, help="Number of calibration data samples." +) +parser.add_argument( + "--seq_len", + type=int, + default=2048, + help="Calibration dataset sequence max length, this should align with your model config", +) +parser.add_argument( + "--batch_size", + type=int, + default=8, + help="Calibration batchsize.", +) # ============GPTQ configs============== parser.add_argument( "--desc_act", @@ -115,33 +130,12 @@ default=128, help="Block size. sub weight matrix size to run GPTQ.", ) -parser.add_argument( - "--n_samples", type=int, default=512, help="Number of calibration data samples." -) -parser.add_argument( - "--seq_len", - type=int, - default=2048, - help="Calibration dataset sequence max length, this should align with your model config", -) parser.add_argument( "--static_groups", action="store_true", help="Use determined group to do quantization", ) # ============AUTOROUND configs============== -parser.add_argument( - "--calib_len", - type=int, - default=2048, - help="Calibration dataset sequence max length, this should align with your model config", -) -parser.add_argument( - "--calib_iters", - type=int, - default=200, - help="Calibration inference iterations", -) parser.add_argument( "--lr", type=float, @@ -154,6 +148,7 @@ default=None, help="minmax learning rate, if None,it will beset to be the same with lr", ) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") parser.add_argument( "--enable_quanted_input", action="store_true", @@ -236,11 +231,12 @@ bits=args.bits, zero_point=False if args.scheme == "sym" else True, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, + n_samples=args.n_samples, + batch_size=args.batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, use_ipex=args.use_ipex, ) elif args.woq_algo == "Teq": @@ -250,11 +246,12 @@ bits=args.bits, sym=True if args.scheme == "sym" else False, group_size=args.group_size, - max_input_length=args.max_input_length, + seq_len=args.seq_len, + batch_size=args.batch_size, + n_samples=args.n_samples, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, use_ipex=args.use_ipex, ) elif args.woq_algo == "GPTQ": @@ -266,14 +263,14 @@ damp_percent=args.damp_percent, sym=True if args.scheme == "sym" else False, blocksize=args.blocksize, - n_samples=args.n_samples, static_groups=args.static_groups, group_size=args.group_size, + n_samples=args.n_samples, seq_len=args.seq_len, + batch_size=args.batch_size, compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - calib_iters=args.calib_iters, layer_wise=args.layer_wise, true_sequential=args.true_sequential, use_ipex=args.use_ipex, @@ -289,8 +286,8 @@ compute_dtype=args.compute_dtype, scale_dtype=args.scale_dtype, weight_dtype=args.weight_dtype, - iters=args.calib_iters, - calib_len=args.calib_len, + iters=args.autoround_iters, + seq_len=args.seq_len, lr=args.lr, minmax_lr=args.minmax_lr, enable_quanted_input=args.enable_quanted_input, @@ -329,11 +326,11 @@ print("Didn't do Weight Only Quantization.") # 
save model -# if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): -# user_model.save_pretrained(args.output_dir) -# tokenizer.save_pretrained(args.output_dir) -# # to validate woq model accuracy -# args.model = args.output_dir +if args.output_dir is not None and ((args.woq or args.load_in_4bit or args.load_in_8bit) and not args.use_neural_speed): + user_model.save_pretrained(args.output_dir) + tokenizer.save_pretrained(args.output_dir) + # to validate woq model accuracy + args.model = args.output_dir if args.benchmark: print("Loading model from: ", args.model) @@ -351,7 +348,7 @@ # start total_time = 0.0 - num_iter = args.iters + num_iter = args.benchmark_iters num_warmup = args.num_warmup total_token_num = 0 eos_token_id = tokenizer.eos_token_id @@ -361,7 +358,7 @@ # tokenizer for chatglm2. if hasattr(tokenizer, "build_chat_input"): input_ids = tokenizer.build_chat_input(prompt)["input_ids"] - input_ids = input_ids.repeat(args.batch_size, 1) + input_ids = input_ids.repeat(args.benchmark_batch_size, 1) eos_token_id = [ tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), @@ -371,11 +368,11 @@ elif hasattr(tokenizer, "build_prompt"): build_prompt = tokenizer.build_prompt(prompt) input_ids = tokenizer( - [build_prompt] * args.batch_size, return_tensors="pt" + [build_prompt] * args.benchmark_batch_size, return_tensors="pt" ).input_ids else: input_ids = tokenizer( - [prompt] * args.batch_size, return_tensors="pt" + [prompt] * args.benchmark_benchmark_batch_size, return_tensors="pt" ).input_ids gen_ids = user_model.generate( input_ids, @@ -408,7 +405,7 @@ model_args=model_args, tasks = args.tasks, device = "cpu", - batch_size = args.batch_size) + batch_size = args.eval_batch_size) results = evaluate(args) for task_name in args.tasks.split(","): if task_name == "wikitext": diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index a190276678e..718b5ccbb20 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -474,7 +474,7 @@ def tokenize_function(examples): tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - tokenized_dataset = tokenized_dataset.filter(lambda x: x['input_ids'].shape[-1] >= max_length) + tokenized_dataset = tokenized_dataset.filter(lambda x: x["input_ids"].shape[-1] >= max_length) def collate_batch(batch): input_ids_padded = [] @@ -520,7 +520,6 @@ def convert_to_quantized_model(model, config, device="cpu"): hasattr(torch, "xpu") and torch.xpu.is_available() ), "There is no xpu device in this system!" 
- model_device = next(model.parameters()).device orig_dtype = torch.float32 for param in model.parameters(): orig_dtype = param.dtype @@ -570,13 +569,59 @@ def convert_to_quantized_model(model, config, device="cpu"): model = convert(model) return model elif config.quant_method.value == "awq": - quant_config = AWQConfig() + quant_config = AWQConfig( + dtype=dtype, + bits=config.bits, + use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + use_auto_scale=config.auto_scale, + use_auto_clip=config.auto_clip, + ) + quant_config.set_local(".*lm_head", AWQConfig(dtype="fp32")) + quant_config.set_local(".*output_layer", AWQConfig(dtype="fp32")) + quant_config.set_local(".*embed_out", AWQConfig(dtype="fp32")) + logger.info(f"Do AWQ algorithm with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.seq_len, # max_length + config.n_samples, # n_samples + config.batch_size, # batch_size + config.quant_method.value, # algo + ) + example_inputs = torch.ones([1, 512], dtype=torch.long).to(device) + model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(model, *run_args) + model = convert(model) elif config.quant_method.value == "teq": quant_config = TEQConfig( dtype=dtype, bits=config.bits, use_sym=config.sym, + group_size=config.group_size, + use_layer_wise=config.layer_wise, + absorb_to_layer=config.absorb_to_layer ) + assert config.absorb_to_layer != {}, "absorb_to_layer is necessary for TEQ algorithm" + quant_config.set_local(".*lm_head", TEQConfig(dtype="fp32")) + quant_config.set_local(".*output_layer", TEQConfig(dtype="fp32")) + quant_config.set_local(".*embed_out", TEQConfig(dtype="fp32")) + logger.info(f"Do TEQ algorithm with config {quant_config}") + run_fn = default_run_fn + run_args = ( + config.tokenizer, + config.dataset, + config.seq_len, # max_length + config.n_samples, # n_samples + config.batch_size, # batch_size + config.quant_method.value, # algo + ) + example_inputs = torch.ones([1, 512], dtype=torch.long).to(device) + model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(model, *run_args) + model = convert(model) elif config.quant_method.value == "gptq": quant_config = GPTQConfig( dtype=dtype, @@ -592,7 +637,7 @@ def convert_to_quantized_model(model, config, device="cpu"): quant_config.set_local(".*lm_head", GPTQConfig(dtype="fp32")) quant_config.set_local(".*output_layer", GPTQConfig(dtype="fp32")) quant_config.set_local(".*embed_out", GPTQConfig(dtype="fp32")) - logger.info(f"Do GPTQ with config {quant_config}") + logger.info(f"Do GPTQ algorithm with config {quant_config}") run_fn = default_run_fn run_args = ( config.tokenizer, @@ -621,7 +666,7 @@ def convert_to_quantized_model(model, config, device="cpu"): quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32")) quant_config.set_local(".*output_layer", AutoRoundConfig(dtype="fp32")) quant_config.set_local(".*embed_out", AutoRoundConfig(dtype="fp32")) - logger.info(f"Do AutoRound with config {quant_config}") + logger.info(f"Do AutoRound algorithm with config {quant_config}") run_fn = get_autoround_default_run_fn run_args = ( config.tokenizer, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index dca0b4e5bd7..fc06eb6b31d 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ 
b/intel_extension_for_transformers/transformers/utils/config.py @@ -1030,10 +1030,14 @@ def __init__( compute_dtype: Any = None, weight_dtype: Any = None, scale_dtype: Any = None, + layer_wise: bool = False, + n_samples: int = 128, + seq_len: Optional[int] = 2048, + auto_scale: bool = True, + auto_clip: bool = True, use_double_quant=False, double_quant_scale_dtype=None, # reserve for double quant zero_point: bool = True, - mse_range: bool = False, use_ggml: bool = False, use_quant: bool = True, use_neural_speed: bool = False, @@ -1049,7 +1053,11 @@ def __init__( self.scale_dtype = scale_dtype self.group_size = group_size self.zero_point = zero_point - self.mse_range = mse_range + self.auto_scale = auto_scale + self.auto_clip = auto_clip + self.layer_wise = layer_wise + self.n_samples = n_samples + self.seq_len = seq_len self.use_double_quant = use_double_quant self.double_quant_scale_dtype = double_quant_scale_dtype self.llm_int8_skip_modules = ( @@ -1059,11 +1067,9 @@ def __init__( self.use_quant = use_quant self.use_neural_speed = use_neural_speed self.device = kwargs.get("device", "auto") - self.calib_dataloader = kwargs.get("calib_dataloader", None) - self.calib_func = kwargs.get("calib_func", None) - self.calib_iters = kwargs.get("calib_iters", 100) self.scheme = "asym" if self.zero_point else "sym" self.sym = True if not self.zero_point else False + self.batch_size = kwargs.pop("batch_size", 8) self.use_ipex = kwargs.pop("use_ipex", False) def to_diff_dict(self) -> Dict[str, Any]: @@ -1098,6 +1104,9 @@ def __init__( compute_dtype: Any = None, weight_dtype: Any = None, scale_dtype: Any = None, + layer_wise: bool = False, + n_samples: int = 128, + seq_len: Optional[int] = 2048, use_double_quant=False, double_quant_scale_dtype=None, # reserve for double quant sym: bool = True, @@ -1116,6 +1125,9 @@ def __init__( self.group_size = group_size self.sym = sym self.scheme = "sym" if self.sym else "asym" + self.layer_wise = layer_wise + self.n_samples = n_samples + self.seq_len = seq_len self.use_double_quant = use_double_quant self.double_quant_scale_dtype = double_quant_scale_dtype self.llm_int8_skip_modules = ( @@ -1124,9 +1136,7 @@ def __init__( self.use_ggml = use_ggml self.use_neural_speed = use_neural_speed self.device = kwargs.get("device", "auto") - self.calib_dataloader = kwargs.get("calib_dataloader", None) - self.calib_func = kwargs.get("calib_func", None) - self.calib_iters = kwargs.get("calib_iters", 100) + self.batch_size = kwargs.pop("batch_size", 8) self.use_ipex = kwargs.pop("use_ipex", False) def to_diff_dict(self) -> Dict[str, Any]: @@ -1167,8 +1177,8 @@ def __init__( lr: float = None, minmax_lr: float = None, enable_quanted_input: bool = True, - nsamples: int = 512, - iters: int = 200, + n_samples: int = 512, + batch_size: int = 200, use_ggml: bool = False, use_neural_speed: bool = False, llm_int8_skip_modules=None, @@ -1189,12 +1199,12 @@ def __init__( self.sym = sym self.use_double_quant = use_double_quant self.double_quant_scale_dtype = double_quant_scale_dtype - self.nsamples = nsamples + self.n_samples = n_samples self.group_size = group_size self.lr = lr self.minmax_lr = minmax_lr self.enable_quanted_input = enable_quanted_input - self.iters = iters + self.batch_size = batch_size self.llm_int8_skip_modules = ( llm_int8_skip_modules if llm_int8_skip_modules else [] ) From 0e40185629cc405dd41f771718f3e83b0c5aad41 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 29 May 2024 03:53:08 -0700 Subject: [PATCH 11/28] rebase autoround Signed-off-by: changwangss --- 
.../transformers/llm/quantization/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 718b5ccbb20..1920e83dacc 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -656,11 +656,12 @@ def convert_to_quantized_model(model, config, device="cpu"): bits=config.bits, use_sym=config.sym, group_size=config.group_size, - use_quant_input=config.use_quant_input, + use_quant_input= not config.disable_quanted_input, lr=config.lr, minmax_lr=config.minmax_lr, - seqlen=config.max_input_length, - n_samples=config.nsamples, + seqlen=config.seq_len, + n_samples=config.n_samples, + iters=config.autoround_iters, scale_dtype=config.scale_dtype, ) quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32")) From 7ae4aec128376752d3349e3bfa6d532dbbea27b3 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 30 May 2024 01:09:32 -0700 Subject: [PATCH 12/28] adapt inc 3.x weightonlylinear Signed-off-by: changwangss --- .../quantization/run_generation_cpu_woq.py | 8 +++-- .../llm/quantization/nn/modules.py | 4 +-- .../transformers/llm/quantization/utils.py | 33 +++++-------------- .../transformers/modeling/modeling_auto.py | 12 +++---- .../transformers/utils/config.py | 28 ++++++---------- 5 files changed, 32 insertions(+), 53 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index 7f8385c200b..ecf79bbc3f2 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -222,6 +222,9 @@ quantization_config = HQQConfig( bits=args.bits, group_size=args.group_size, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, use_ipex=args.use_ipex, ) elif args.woq_algo == "Awq": @@ -341,6 +344,7 @@ _commit_hash=args._commit_hash, use_neural_speed=args.use_neural_speed, ) + user_model = user_model.eval() if hasattr(user_model, "eval") else user_model prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun." 
input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1) @@ -372,7 +376,7 @@ ).input_ids else: input_ids = tokenizer( - [prompt] * args.benchmark_benchmark_batch_size, return_tensors="pt" + [prompt] * args.benchmark_batch_size, return_tensors="pt" ).input_ids gen_ids = user_model.generate( input_ids, @@ -401,7 +405,7 @@ model_args="pretrained="+args.model+",trust_remote_code="+str(args.trust_remote_code) if args.use_neural_speed: model_args += ",model_format=neural_speed" - args = LMEvalParser(model = "hf", + args = LMEvalParser(model = "hf", model_args=model_args, tasks = args.tasks, device = "cpu", diff --git a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py index 0e073b258bc..70c3faa2d3b 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py @@ -17,7 +17,6 @@ import os import torch -from ..utils import DTYPE_BITS_MAPPING from functools import reduce from operator import mul from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING, PeftType @@ -99,6 +98,7 @@ def __init__( compute_dtype="fp32", compress_statistics=True, weight_dtype="int4_clip", + bits=4, scale_dtype="fp32", blocksize=32, scheme="sym", @@ -115,7 +115,7 @@ def __init__( self.blocksize = blocksize self.scheme = scheme self.weight_dtype = weight_dtype - self.bits = DTYPE_BITS_MAPPING[weight_dtype] + self.bits = bits self.scale_dtype = scale_dtype self.double_quant_scale_dtype = double_quant_scale_dtype self.compression_dim = compression_dim diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 1920e83dacc..9542ba6d34c 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -71,17 +71,6 @@ logger = logging.getLogger(__name__) -DTYPE_BITS_MAPPING = { - "nf4": 4, - "fp4_e2m1_bnb": 4, - "fp4_e2m1": 4, - "int4_fullrange": 4, - "int4_clip": 4, - "fp8_e5m2": 8, - "fp8_e4m3": 8, - "int8": 8, -} - def unpack_weight(qweight, scales, qzeros, q_config): sym = q_config.sym @@ -288,6 +277,7 @@ def _replace_linear( compute_dtype=quantization_config.compute_dtype, compress_statistics=False, weight_dtype=quantization_config.weight_dtype, + bits=quantization_config.bits, scale_dtype=quantization_config.scale_dtype, blocksize=quantization_config.group_size, scheme=quantization_config.scheme, @@ -397,7 +387,7 @@ def _replace_linear( else: if not hasattr(module, "qweight"): n_pack = ( - 8 // DTYPE_BITS_MAPPING[quantization_config.weight_dtype] + 8 // quantization_config.bits ) weight = torch.zeros( (math.ceil(out_features / n_pack), in_features), @@ -529,7 +519,6 @@ def convert_to_quantized_model(model, config, device="cpu"): if config.weight_dtype in ["fp8_e4m3", "fp8_e5m2"]: return replace_linear(model, None, None, config, device=device) else: - bits = DTYPE_BITS_MAPPING[config.weight_dtype] if config.weight_dtype == "int8": dtype = "int8" elif "int4" in config.weight_dtype: @@ -539,21 +528,15 @@ def convert_to_quantized_model(model, config, device="cpu"): # mapping to INC config if config.quant_method.value == "rtn": quant_config = RTNConfig( - dtype=config.weight_dtype, + dtype=dtype, bits=config.bits, use_sym=config.sym, group_size=config.group_size, - group_dim=config.group_dim, - 
use_full_range=config.use_full_range, - use_mse_search=config.mse_range, use_layer_wise=config.layer_wise, - model_path=config.model_path, - use_double_quant=config.use_double_quant, - double_quant_dtype=config.double_quant_dtype, - double_quant_bits=config.double_quant_bits, - double_quant_use_sym=config.double_quant_use_sym, - double_quant_group_size=config.double_quant_group_size, ) + quant_config.set_local(".*lm_head", RTNConfig(dtype="fp32")) + quant_config.set_local(".*output_layer", RTNConfig(dtype="fp32")) + quant_config.set_local(".*embed_out", RTNConfig(dtype="fp32")) model = prepare(model, quant_config) model = convert(model) elif config.quant_method.value == "hqq": @@ -567,7 +550,6 @@ def convert_to_quantized_model(model, config, device="cpu"): ) model = prepare(model, quant_config) model = convert(model) - return model elif config.quant_method.value == "awq": quant_config = AWQConfig( dtype=dtype, @@ -604,7 +586,7 @@ def convert_to_quantized_model(model, config, device="cpu"): use_layer_wise=config.layer_wise, absorb_to_layer=config.absorb_to_layer ) - assert config.absorb_to_layer != {}, "absorb_to_layer is necessary for TEQ algorithm" + assert config.absorb_to_layer != {}, "absorb_to_layer is necessary for TEQ algorithm" quant_config.set_local(".*lm_head", TEQConfig(dtype="fp32")) quant_config.set_local(".*output_layer", TEQConfig(dtype="fp32")) quant_config.set_local(".*embed_out", TEQConfig(dtype="fp32")) @@ -623,6 +605,7 @@ def convert_to_quantized_model(model, config, device="cpu"): run_fn(model, *run_args) model = convert(model) elif config.quant_method.value == "gptq": + model.seqlen = config.seq_len quant_config = GPTQConfig( dtype=dtype, bits=config.bits, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 3ebc9f00ef5..55c930145b9 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -75,7 +75,7 @@ from ...tools.utils import is_intel_gpu_available, is_ipex_available from accelerate import init_empty_weights from huggingface_hub import hf_hub_download -from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear +from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.model.torch_model import PyTorchFXModel from threading import Thread from transformers.configuration_utils import PretrainedConfig @@ -127,12 +127,13 @@ def recover_export_model(model, current_key_name=None): zeros, int_weight, ) = module.recover_qparms() + dtype = "int4" if weight_dtype == "int4_clip" else weight_dtype model._modules[name] = WeightOnlyLinear( in_features, out_features, + dtype=dtype, bits=bits, - groupsize=groupsize, - dtype="int", + group_size=groupsize, zp=zp, bias=module.bias is not None, scale_dtype=scales_dtype, @@ -168,9 +169,9 @@ def build_woq_model(model, quantization_config): new_module = WeightOnlyLinear( m.in_features, m.out_features, - quantization_config.bits, - quantization_config.group_size, dtype="int", + bits=quantization_config.bits, + group_size=quantization_config.group_size, zp=zp, bias=m.bias is not None, g_idx=True, @@ -1779,7 +1780,6 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): # Set model in evaluation mode to deactivate DropOut modules by default model.eval() - if quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", diff --git 
a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 6dbcda1e4c2..0d5a246d1a7 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -786,14 +786,13 @@ def __init__( scale_dtype: Any = None, use_full_range: bool = False, mse_range: bool = False, - use_double_quant=False, + use_double_quant: bool = False, double_quant_dtype: str = "int", double_quant_bits: int = 8, double_quant_use_sym: bool = False, double_quant_group_size: int = 256, sym: bool = True, layer_wise: bool = False, - model_path: str = "", use_ggml: bool = False, use_quant: bool = True, use_neural_speed: bool = False, @@ -810,7 +809,6 @@ def __init__( self.group_size = group_size self.group_dim = group_dim self.layer_wise = layer_wise - self.model_path = model_path self.sym = sym self.scheme = "sym" if self.sym else "asym" self.use_double_quant = use_double_quant @@ -825,10 +823,6 @@ def __init__( self.use_quant = use_quant self.use_neural_speed = use_neural_speed self.device = kwargs.get("device", "auto") - self.calib_dataloader = None - self.dataset = None - self.calib_func = None - self.calib_iters = None self.use_ipex = kwargs.pop("use_ipex", False) def to_diff_dict(self) -> Dict[str, Any]: @@ -857,6 +851,10 @@ def __init__( self, bits: int = 4, group_size: int = 64, + sym: bool = True, + compute_dtype: Any = None, + weight_dtype: Any = None, + scale_dtype: Any = None, quant_zero: bool = True, quant_scale: bool = False, scale_quant_group_size: int = 128, @@ -865,21 +863,18 @@ def __init__( ): self.quant_method = QuantizationMethod.HQQ self.bits = bits - self.weight_dtype = None - self.compute_dtype = None - self.scale_dtype = None + self.weight_dtype = weight_dtype + self.compute_dtype = compute_dtype + self.scale_dtype = scale_dtype self.use_double_quant = False - self.scheme = "" + self.sym = sym + self.scheme = "sym" if self.sym else "asym" self.group_size = group_size self.quant_zero = quant_zero self.quant_scale = quant_scale self.scale_quant_group_size = scale_quant_group_size self.skip_lm_head = skip_lm_head self.device = kwargs.get("device", "auto") - self.calib_dataloader = None - self.dataset = None - self.calib_func = None - self.calib_iters = None self.use_ipex = kwargs.pop("use_ipex", False) def to_diff_dict(self) -> Dict[str, Any]: @@ -963,9 +958,6 @@ def __init__( self.use_quant = use_quant self.use_neural_speed = use_neural_speed self.device = kwargs.get("device", "auto") - self.calib_dataloader = kwargs.get("calib_dataloader", None) - self.calib_func = kwargs.get("calib_func", None) - self.calib_iters = kwargs.get("calib_iters", 100) self.scheme = "sym" if self.sym else "asym" if isinstance(compute_dtype, torch.dtype): From f6f0ffc5c23d98c40978e57686f5981ab69d0216 Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 31 May 2024 01:05:14 -0700 Subject: [PATCH 13/28] support autoround 0.2 and sq with alpha auto Signed-off-by: changwangss --- .../quantization/run_generation_cpu_woq.py | 10 ++++- .../transformers/__init__.py | 2 +- .../transformers/llm/quantization/utils.py | 43 +++++++++++-------- .../transformers/modeling/modeling_auto.py | 4 +- .../transformers/utils/__init__.py | 2 +- .../transformers/utils/config.py | 40 +++++++---------- 6 files changed, 51 insertions(+), 50 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py 
b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index ecf79bbc3f2..8ebb18a8756 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -12,7 +12,7 @@ from intel_extension_for_transformers.transformers import ( BitsAndBytesConfig, RtnConfig, - HQQConfig, + HqqConfig, AwqConfig, TeqConfig, GPTQConfig, @@ -154,6 +154,11 @@ action="store_true", help="whether to use the output of quantized block to tune the next block", ) +parser.add_argument( + "--quant_lm_head", + action="store_true", + help="whether to quant the lm head layer", +) # ============BitsAndBytes configs============== parser.add_argument("--bitsandbytes", action="store_true") @@ -219,7 +224,7 @@ use_ipex=args.use_ipex, ) elif args.woq_algo == "HQQ": - quantization_config = HQQConfig( + quantization_config = HqqConfig( bits=args.bits, group_size=args.group_size, compute_dtype=args.compute_dtype, @@ -294,6 +299,7 @@ lr=args.lr, minmax_lr=args.minmax_lr, disable_quanted_input=args.disable_quanted_input, + quant_lm_head = args.quant_lm_head, use_ipex=args.use_ipex, ) else: diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 014335155b4..54154f1f0fc 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -48,7 +48,7 @@ DynamicQuantConfig, QuantAwareTrainingConfig, RtnConfig, - HQQConfig, + HqqConfig, AwqConfig, TeqConfig, GPTQConfig, diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 9542ba6d34c..573b71c9c36 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -23,10 +23,6 @@ from accelerate import init_empty_weights from datasets import load_dataset -from neural_compressor import quantization -from neural_compressor.torch.algorithms.weight_only.autoround import ( - get_autoround_default_run_fn, -) from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear from neural_compressor.torch.quantization import ( AutoRoundConfig, @@ -64,6 +60,7 @@ from auto_round.export.export_to_itrex.model_wrapper import ( WeightOnlyLinear as auto_round_woqlinear, ) # pylint: disable=E0401 + from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader torch = LazyImport("torch") @@ -501,6 +498,15 @@ def collate_batch(batch): except ValueError: pass +@torch.no_grad() +def run_fn_for_autoround(model, dataloader): + for data in dataloader: + if isinstance(data, tuple) or isinstance(data, list): + model(*data) + elif isinstance(data, dict): + model(**data) + else: + model(data) def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): @@ -639,28 +645,27 @@ def convert_to_quantized_model(model, config, device="cpu"): bits=config.bits, use_sym=config.sym, group_size=config.group_size, - use_quant_input= not config.disable_quanted_input, + enable_quanted_input=not config.disable_quanted_input, lr=config.lr, minmax_lr=config.minmax_lr, seqlen=config.seq_len, n_samples=config.n_samples, - iters=config.autoround_iters, + iters=config.iters, scale_dtype=config.scale_dtype, ) - 
quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32")) - quant_config.set_local(".*output_layer", AutoRoundConfig(dtype="fp32")) - quant_config.set_local(".*embed_out", AutoRoundConfig(dtype="fp32")) + if config.quant_lm_head is False: + quant_config.set_local(".*lm_head", AutoRoundConfig(dtype="fp32")) + quant_config.set_local(".*output_layer", AutoRoundConfig(dtype="fp32")) + quant_config.set_local(".*embed_out", AutoRoundConfig(dtype="fp32")) logger.info(f"Do AutoRound algorithm with config {quant_config}") - run_fn = get_autoround_default_run_fn - run_args = ( - config.tokenizer, - config.dataset, - quant_config.n_samples, - quant_config.seqlen, - quant_config.seed, - quant_config.batch_size, - "train", - ) + dataloader = get_autoround_dataloader(tokenizer=config.tokenizer, + seqlen=config.seq_len, + dataset_name="NeelNanda/pile-10k", + seed=42, + bs=config.batch_size, + n_samples=config.n_samples) + run_fn = run_fn_for_autoround + run_args = (dataloader,) model = prepare(model=model, quant_config=quant_config) run_fn(model, *run_args) model = convert(model) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 55c930145b9..b460fd60051 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -46,7 +46,7 @@ DynamicQuantConfig, QuantAwareTrainingConfig, RtnConfig, - HQQConfig, + HqqConfig, AwqConfig, TeqConfig, GPTQConfig, @@ -714,7 +714,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: logger.info("Mixed Precision done.") elif isinstance( quantization_config, - (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig, HQQConfig), + (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig, HqqConfig), ): logger.info("Applying Weight Only Quantization.") if use_neural_speed: diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index 7d810beca21..ee258fbd797 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -25,7 +25,7 @@ QuantAwareTrainingConfig, SparsityConfig, RtnConfig, - HQQConfig, + HqqConfig, AwqConfig, TeqConfig, GPTQConfig, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 0d5a246d1a7..bfa93da798e 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -796,7 +796,6 @@ def __init__( use_ggml: bool = False, use_quant: bool = True, use_neural_speed: bool = False, - llm_int8_skip_modules=None, **kwargs, ): self.quant_method = QuantizationMethod.RTN @@ -816,9 +815,7 @@ def __init__( self.double_quant_bits = double_quant_bits self.double_quant_use_sym = double_quant_use_sym self.double_quant_group_size = double_quant_group_size - self.llm_int8_skip_modules = ( - llm_int8_skip_modules if llm_int8_skip_modules else [] - ) + self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) self.use_ggml = use_ggml self.use_quant = use_quant self.use_neural_speed = use_neural_speed @@ -846,7 +843,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict -class HQQConfig(ITREXQuantizationConfigMixin): +class 
HqqConfig(ITREXQuantizationConfigMixin): def __init__( self, bits: int = 4, @@ -874,6 +871,7 @@ def __init__( self.quant_scale = quant_scale self.scale_quant_group_size = scale_quant_group_size self.skip_lm_head = skip_lm_head + self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) self.device = kwargs.get("device", "auto") self.use_ipex = kwargs.pop("use_ipex", False) @@ -916,14 +914,13 @@ def __init__( damp_percent: float = 0.1, desc_act: bool = False, n_samples: int = 128, - seq_len: Optional[int] = 2048, + seq_len: int = 2048, static_groups: bool = False, true_sequential: bool = False, layer_wise: bool = False, use_ggml: bool = False, use_quant: bool = True, use_neural_speed: bool = False, - llm_int8_skip_modules=None, **kwargs, ): @@ -951,9 +948,7 @@ def __init__( self.true_sequential = true_sequential self.layer_wise = layer_wise self.seq_len = seq_len - self.llm_int8_skip_modules = ( - llm_int8_skip_modules if llm_int8_skip_modules else [] - ) + self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) self.use_ggml = use_ggml self.use_quant = use_quant self.use_neural_speed = use_neural_speed @@ -1024,7 +1019,7 @@ def __init__( scale_dtype: Any = None, layer_wise: bool = False, n_samples: int = 128, - seq_len: Optional[int] = 2048, + seq_len: int = 2048, auto_scale: bool = True, auto_clip: bool = True, use_double_quant=False, @@ -1033,7 +1028,6 @@ def __init__( use_ggml: bool = False, use_quant: bool = True, use_neural_speed: bool = False, - llm_int8_skip_modules=None, **kwargs, ): self.quant_method = QuantizationMethod.AWQ @@ -1052,9 +1046,7 @@ def __init__( self.seq_len = seq_len self.use_double_quant = use_double_quant self.double_quant_scale_dtype = double_quant_scale_dtype - self.llm_int8_skip_modules = ( - llm_int8_skip_modules if llm_int8_skip_modules else [] - ) + self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) self.use_ggml = use_ggml self.use_quant = use_quant self.use_neural_speed = use_neural_speed @@ -1098,13 +1090,12 @@ def __init__( scale_dtype: Any = None, layer_wise: bool = False, n_samples: int = 128, - seq_len: Optional[int] = 2048, + seq_len: int = 2048, use_double_quant=False, double_quant_scale_dtype=None, # reserve for double quant sym: bool = True, use_ggml: bool = False, use_neural_speed: bool = False, - llm_int8_skip_modules=None, **kwargs, ): self.quant_method = QuantizationMethod.TEQ @@ -1122,9 +1113,7 @@ def __init__( self.seq_len = seq_len self.use_double_quant = use_double_quant self.double_quant_scale_dtype = double_quant_scale_dtype - self.llm_int8_skip_modules = ( - llm_int8_skip_modules if llm_int8_skip_modules else [] - ) + self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) self.use_ggml = use_ggml self.use_neural_speed = use_neural_speed self.device = kwargs.get("device", "auto") @@ -1169,11 +1158,12 @@ def __init__( lr: float = None, minmax_lr: float = None, disable_quanted_input: bool = True, - n_samples: int = 512, + n_samples: int = 128, + seq_len: int = 2048, iters: int = 200, + quant_lm_head: bool = False, use_ggml: bool = False, use_neural_speed: bool = False, - llm_int8_skip_modules=None, **kwargs, ): @@ -1197,9 +1187,9 @@ def __init__( self.minmax_lr = minmax_lr self.disable_quanted_input = disable_quanted_input self.iters = iters - self.llm_int8_skip_modules = ( - llm_int8_skip_modules if llm_int8_skip_modules else [] - ) + self.seq_len = seq_len + self.quant_lm_head = quant_lm_head + self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) self.use_ggml = 
use_ggml self.use_neural_speed = use_neural_speed self.batch_size = kwargs.pop("batch_size", 8) From 02d575177c08c28650bfd1709f0d168a76c6d39d Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 2 Jun 2024 22:01:55 -0700 Subject: [PATCH 14/28] fix awq folding setting to True Signed-off-by: changwangss --- .../transformers/llm/quantization/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 573b71c9c36..a45cf647242 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -565,6 +565,7 @@ def convert_to_quantized_model(model, config, device="cpu"): use_layer_wise=config.layer_wise, use_auto_scale=config.auto_scale, use_auto_clip=config.auto_clip, + folding=True, ) quant_config.set_local(".*lm_head", AWQConfig(dtype="fp32")) quant_config.set_local(".*output_layer", AWQConfig(dtype="fp32")) From e8c0946ea37476fd6f4fb6e9980c9511dbb96d76 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 10 Jun 2024 23:13:35 -0700 Subject: [PATCH 15/28] rebase Signed-off-by: changwangss --- .../quantization/llm_quantization_recipes.md | 8 +- .../quantization/run_generation_sq.py | 21 -- .../transformers/llm/quantization/utils.py | 14 +- .../transformers/modeling/modeling_auto.py | 254 ------------------ .../transformers/utils/config.py | 5 - 5 files changed, 12 insertions(+), 290 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md index 037598c2d42..ce1764bc0bb 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md +++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md @@ -40,9 +40,11 @@ pip install -v . 
# install requirements cd examples/huggingface/pytorch/text-generation/quantization pip install -r requirements.txt -pip install neural-compressor==2.5 -pip install transformers==4.35.2 +pip install neural-compressor==2.6 pip install torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu +# 4.38.1 is only limited by smoothquant +pip install transformers==4.38.1 +# ipex is only necessary for smoothquant pip install intel-extension-for-pytorch==2.3.0 ``` @@ -738,7 +740,7 @@ python run_generation_cpu_woq.py \ --seq_len 2048 \ --scheme sym \ --group_size 32 \ - --nsamples 256 \ + --n_samples 256 \ --accuracy # int4 AutoRound diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index 3517a9c05d4..b32655bac12 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -183,33 +183,12 @@ from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( recover_model_from_json, ) -<<<<<<< HEAD user_model = recover_model_from_json( args.model, os.path.join(args.output_dir, "best_configure.json"), args.trust_remote_code, ) -======= - - if args.restore: - from intel_extension_for_transformers.transformers.utils.utility import ( - recover_model_from_json, - ) - user_model = recover_model_from_json( - args.model, - os.path.join(args.output_dir, "best_configure.json"), - args.trust_remote_code, - ) - else: - user_model = torch.jit.load(os.path.join( args.model, "best_model.pt")) - config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) - origin_model_type = config.model_type - if origin_model_type in ["chatglm", "qwen", "baichuan"]: - config.model_type = "qwen2" - user_model = TSModelCausalLMForITREX(user_model, config=config) - user_model.config.model_type = origin_model_type ->>>>>>> main elif not (args.sq or args.mixed_precision): user_model = AutoModelForCausalLM.from_pretrained( args.model, diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 534adcdbd58..afad1d516c2 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -45,16 +45,16 @@ ) from ...utils import CpuInfo -from .sq_utils import ( - IPEX_OPT_LLM_SUPPORTED, - MODEL_TYPES_REQUIRING_POSITION_IDS, - generate_dummy_past_key_values, - generate_dummy_past_key_values_for_opt_llm, - get_dataloader, -) if is_ipex_available(): import intel_extension_for_pytorch as ipex + from .sq_utils import ( + IPEX_OPT_LLM_SUPPORTED, + MODEL_TYPES_REQUIRING_POSITION_IDS, + generate_dummy_past_key_values, + generate_dummy_past_key_values_for_opt_llm, + get_dataloader, + ) if is_autoround_available(): from auto_round.export.export_to_itrex.model_wrapper import ( diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 186dbe2eaf1..89ab4f758ea 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -845,262 +845,8 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: ) and model.config.model_type == "chatglm": 
model = model.float() model.eval() -<<<<<<< HEAD logger.info("Applying SmoothQuant.") model = convert_to_smoothquant_model(model, quantization_config) -======= - model_type = model.config.model_type.replace("_", "-") - - logger.info("Applying SmoothQuant.") - # ipex.optimize_transformers - if quantization_config.ipex_opt_llm is None: - if model_type in IPEX_OPT_LLM_SUPPORTED: - quantization_config.ipex_opt_llm = True - logger.info( - "quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used." - ) - logger.warning("The suggested transformers version is 4.38.1.") - else: - quantization_config.ipex_opt_llm = False - if quantization_config.ipex_opt_llm: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5) - model = ipex.optimize_transformers( - model.eval(), - quantization_config=qconfig, - dtype=torch.float32, - inplace=True, - deployment_mode=False, - ) - model.eval() - - # past_key_values - num_beams = quantization_config.num_beams - if quantization_config.ipex_opt_llm: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=model.config, input_bs=1, num_beams=num_beams - ) - else: - past_key_values = generate_dummy_past_key_values( - config=model.config, input_bs=1 - ) - - # calibration function - calib_func = quantization_config.calib_func - tokenizer = quantization_config.tokenizer - if calib_func is None: - if quantization_config.tokenizer is None: - logger.error( - "Please provide the tokenizer or provide calib_func directly," - + " the following is how to get tokenizer. \n" - + " from transformer import AutoTokenizer \n" - + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - - from datasets import load_dataset - from torch.utils.data import DataLoader - - calib_dataset = quantization_config.calib_dataset - calib_shuffle = quantization_config.calib_shuffle - calib_iters = quantization_config.calib_iters - calib_padding = quantization_config.calib_padding - calib_len = quantization_config.calib_len - calib_pad_val = quantization_config.calib_pad_val - from torch.nn.functional import pad - - calib_dataset = load_dataset( - calib_dataset, - split=( - "test" - if calib_dataset in ["mbpp", "openai_humaneval"] - else "train" - ), - ) - if calib_shuffle: - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - if "code" in examples: - example = tokenizer(examples["code"]) - elif "prompt" in examples: - example = tokenizer(examples["prompt"]) - elif "text" in examples: - example = tokenizer(examples["text"]) - else: - logger.error( - "Please check dataset prompt identifier," - + " NeelNanda/pile-10k is default used calibration dataset." 
- ) - exit(0) - return example - - def collate_batch(batch): - position_ids_padded = [] - input_ids_padded = [] - last_ind = [] - attention_mask_padded = [] - for text in batch: - input_ids = text["input_ids"] - if not calib_padding: - input_ids = ( - input_ids[: int(calib_len)] - if len(input_ids) > int(calib_len) - else input_ids - ) # no_padding - else: - pad_len = calib_len - input_ids.shape[0] - input_ids = pad( - input_ids, (0, pad_len), value=calib_pad_val - ) - - last_ind.append(input_ids.shape[0] - 1) - if model_type in ["bloom"]: - attention_mask = torch.ones(len(input_ids) + 1) - attention_mask[0] = 0 - else: - attention_mask = torch.ones(len(input_ids)) - position_ids = torch.arange(len(input_ids)) - input_ids_padded.append(input_ids) - attention_mask_padded.append(attention_mask) - position_ids_padded.append(position_ids) - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - return ( - { - "input_ids": torch.vstack(input_ids_padded), - "attention_mask": torch.vstack(attention_mask_padded), - "position_ids": torch.vstack(position_ids_padded), - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - else: - return ( - { - "input_ids": torch.vstack(input_ids_padded), - "attention_mask": torch.vstack(attention_mask_padded), - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - - def collate_batch_for_chatglm(batch): - last_ind = [] - for text in batch: - input_ids = torch.vstack([text["input_ids"]]) - if re.search( - "THUDM/chatglm-6b", model.config.auto_map["AutoConfig"] - ): - input_ids = ( - input_ids[:, :calib_len] - if input_ids.shape[1] > calib_len - else input_ids - ) - eos = torch.tensor([130001, 130004]).repeat(1, 1) - input_ids = torch.cat((input_ids, eos), 1) - else: - input_ids = ( - input_ids[:, :calib_len] - if input_ids.shape[1] > calib_len - else input_ids - ) - prepared_inputs = model.prepare_inputs_for_generation(input_ids) - attention_mask = torch.ones_like(input_ids) - last_ind.append(input_ids.shape[1] - 1) - return ( - { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": prepared_inputs["position_ids"], - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - if model_type == "chatglm": - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch_for_chatglm, - ) - else: - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - - def calib_func(model): - with torch.no_grad(): - for i, (inputs, last_ind) in enumerate(calib_dataloader): - if i >= calib_iters: - break - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - model( - input_ids=inputs["input_ids"], - past_key_values=inputs["past_key_values"], - position_ids=inputs["position_ids"], - attention_mask=inputs["attention_mask"], - ) - else: - model( - input_ids=inputs["input_ids"], - past_key_values=inputs["past_key_values"], - attention_mask=inputs["attention_mask"], - ) - - logger.info( - "The default calibration function is used, " - + "the calibration dataset is NeelNanda/pile-10k, " - + "batchsize is 1 and calibration iteration is 100." 
- ) - calib_func = calib_func - - # example_inputs - example_inputs = quantization_config.example_inputs - if example_inputs is None: - for i, (inputs, last_ind) in enumerate(calib_dataloader): - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - example_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "position_ids": inputs["position_ids"], - "past_key_values": inputs["past_key_values"], - } - else: - example_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "past_key_values": inputs["past_key_values"], - } - break - - # call inc sq - from neural_compressor import PostTrainingQuantConfig, quantization - - conf = PostTrainingQuantConfig( - backend=quantization_config.backend, # default is ipex - excluded_precisions=quantization_config.excluded_precisions, - op_type_dict=quantization_config.op_type_dict, - op_name_dict=quantization_config.op_name_dict, - recipes=quantization_config.recipes, - example_inputs=example_inputs, - ) - model = quantization.fit( - model, - conf, - calib_func=calib_func, - calib_dataloader=( - calib_dataloader - if quantization_config.recipes["smooth_quant_args"]["alpha"] - == "auto" - else None - ), - ) ->>>>>>> main logger.info("SmoothQuant done.") elif isinstance(quantization_config, DynamicQuantConfig): model = cls.ORIG_MODEL.from_pretrained( diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 8b1a3c5bfc3..4fec7d089c4 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -321,14 +321,9 @@ def post_init_cpu(self): "fp8_e4m3", ]: raise ValueError( -<<<<<<< HEAD - "weight_dtype must be a string in " - "'int8', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'" -======= f"weight_dtype must be a string in " f"'int8', 'int4', 'int4_clip', 'nf4', 'fp4', 'fp4_e2m1_bnb', 'fp4_e2m1', " f"'fp8', 'fp8_e5m2, fp8_e4m3'" ->>>>>>> main ) if self.scale_dtype is not None and self.scale_dtype not in [ From 658e129183d8e2754e7295b38e2df99167dd7c23 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 11 Jun 2024 00:01:40 -0700 Subject: [PATCH 16/28] fix extension Signed-off-by: changwangss --- .../text-generation/quantization/run_benchmark.sh | 3 --- .../pytorch/text-generation/quantization/run_tuning.sh | 10 +++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index bf72f1c8b7e..e74b7077be0 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -232,9 +232,6 @@ function run_benchmark { fi fi if [[ ${int8} == "true" ]] && [[ "$model_source" != "huggingface" ]]; then - # if [[ "${script}" == "run_generation_sq.py" ]] && [[ "${topology}" != "gpt_j_mp" ]];then - # extra_cmd=$extra_cmd" --int8" - # fi model_name_or_path=$tuned_checkpoint fi if [[ $backend == "neuralspeed" ]]; then diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index 16eaaa3182e..d2e77761da2 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ 
b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -230,8 +230,8 @@ function run_tuning { script="run_generation_sq.py" elif [ "${topology}" = "llama2_7b_gptq" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq --bits ${bits} --compute_dtype fp32 --scheme ${scheme} --calib_iters 100" - extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 " + extra_cmd=$extra_cmd" --woq --bits ${bits} --compute_dtype fp32 --scheme ${scheme} --n_samples 100" + extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --seq_len 2048 " extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code" extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}" @@ -239,7 +239,7 @@ function run_tuning { elif [ "${topology}" = "mistral_7b_autoround" ]; then model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1" extra_cmd=$extra_cmd" --woq --bits ${bits} --compute_dtype fp32 --scheme ${scheme} " - extra_cmd=$extra_cmd" --woq_algo "AutoRound" --desc_act --group_size 128 --calib_len 2048 --calib_iters 100" + extra_cmd=$extra_cmd" --woq_algo "AutoRound" --desc_act --group_size 128 --seq_len 2048 --n_samples 100" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code" extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}" @@ -254,8 +254,8 @@ function run_tuning { script="run_generation_cpu_woq.py" elif [ "${topology}" = "mistral_7b_gptq" ]; then model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1" - extra_cmd=$extra_cmd" --woq --bits ${bits} --compute_dtype fp32 --scheme ${scheme} --calib_iters 100" - extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 --group_size 128" + extra_cmd=$extra_cmd" --woq --bits ${bits} --compute_dtype fp32 --scheme ${scheme} --n_samples 100" + extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --seq_len 2048 --group_size 128" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" extra_cmd=$extra_cmd" --trust_remote_code" extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}" From fe06a847b827a78a0fc907be46614984092aa8d7 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 11 Jun 2024 00:13:31 -0700 Subject: [PATCH 17/28] support extension Signed-off-by: changwangss --- .../quantization/llm_quantization_recipes.md | 26 +++++++++---------- .../quantization/run_benchmark.sh | 4 +-- .../quantization/run_generation_sq.py | 17 +++++++----- .../transformers/utils/config.py | 1 + 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md index ce1764bc0bb..f3c697d0680 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md +++ b/examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md @@ -61,7 +61,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.85 ``` @@ -115,7 +115,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.9 ``` @@ -169,7 +169,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.5 ``` @@ -340,7 +340,7 @@ python run_generation_sq.py \ --tasks 
lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.8 ``` @@ -394,7 +394,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.9 ``` @@ -500,7 +500,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.95 ``` @@ -554,7 +554,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.65 ``` @@ -662,7 +662,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.75 ``` @@ -715,7 +715,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.9 ``` @@ -768,7 +768,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.6 ``` @@ -821,7 +821,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.7 ``` @@ -874,7 +874,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.75 ``` @@ -927,7 +927,7 @@ python run_generation_sq.py \ --tasks lambada_openai \ --sq \ --accuracy \ - --batch_size 56 \ + --eval_batch_size 56 \ --alpha 0.75 ``` diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index e74b7077be0..be12520c902 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -73,7 +73,7 @@ function run_benchmark { extra_cmd=$extra_cmd" --tasks ${lm_eval_tasks}" elif [[ ${mode} == "benchmark" ]]; then mode_cmd=" --benchmark " - extra_cmd=$extra_cmd" --iters ${iters}" + extra_cmd=$extra_cmd" --benchmark_iters ${iters}" else echo "Error: No such mode: ${mode}" exit 1 @@ -248,7 +248,7 @@ function run_benchmark { elif [ "${script}" == "run_generation_cpu_woq.py" ];then python -u ./${script} \ --model ${model_name_or_path} \ - --batch_size ${batch_size} \ + --eval_batch_size ${batch_size} \ ${mode_cmd} \ ${extra_cmd} else diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index b32655bac12..7b34ea720f7 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -43,11 +43,12 @@ ) # ============Benchmark configs============== parser.add_argument("--benchmark", action="store_true") -parser.add_argument("--iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_iters", default=100, type=int, help="num iter") +parser.add_argument("--benchmark_batch_size", default=1, type=int, help="batch size for benchmark") parser.add_argument("--num_warmup", default=10, type=int, help="num warmup") # ============Accuracy configs============== parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--batch_size", default=56, type=int, help="batch size num.") +parser.add_argument("--eval_batch_size", default=56, type=int, help="batch size num.") 
parser.add_argument( "--tasks", default="lambada_openai", @@ -65,6 +66,7 @@ parser.add_argument( "--seq_len", default=512, type=int, help="Smooth quant calibration input length." ) +parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") # sq alpha "auto" parameters parser.add_argument("--scale_sharing", action="store_true") parser.add_argument( @@ -138,6 +140,7 @@ tokenizer=tokenizer, seq_len=args.seq_len, n_samples=args.n_samples, + batch_size=args.batch_size, excluded_precisions=excluded_precisions, alpha=args.alpha if args.alpha == "auto" else float(args.alpha), scale_sharing=args.scale_sharing, @@ -205,7 +208,7 @@ # start total_time = 0.0 - num_iter = args.iters + num_iter = args.benchmark_iters num_warmup = args.num_warmup total_token_num = 0 eos_token_id = tokenizer.eos_token_id @@ -215,7 +218,7 @@ # for chatglm2 only if hasattr(tokenizer, "build_chat_input"): input_ids = tokenizer.build_chat_input(prompt)["input_ids"] - input_ids = input_ids.repeat(args.batch_size, 1) + input_ids = input_ids.repeat(args.benchmark_batch_size, 1) eos_token_id = [ tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), @@ -225,11 +228,11 @@ elif hasattr(tokenizer, "build_prompt"): build_prompt = tokenizer.build_prompt(prompt) input_ids = tokenizer( - [build_prompt] * args.batch_size, return_tensors="pt" + [build_prompt] * args.benchmark_batch_size, return_tensors="pt" ).input_ids else: input_ids = tokenizer( - [prompt] * args.batch_size, return_tensors="pt" + [prompt] * args.benchmark_batch_size, return_tensors="pt" ).input_ids gen_ids = user_model.generate( input_ids, @@ -270,7 +273,7 @@ user_model=user_model, tasks=args.tasks, device="cpu", - batch_size=args.batch_size, + batch_size=args.eval_batch_size, ) results = evaluate(args) for task_name in args.tasks.split(","): diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 4fec7d089c4..3f6da8c8a05 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -780,6 +780,7 @@ def __init__( self.ipex_opt_llm = ipex_opt_llm self.num_beams = num_beams self.excluded_precisions = excluded_precisions + self.batch_size = kwargs.pop("batch_size", 1) class RtnConfig(ITREXQuantizationConfigMixin): From 17287f870528a4cdaff74acf9ffd86cb580a2796 Mon Sep 17 00:00:00 2001 From: changwangss Date: Tue, 11 Jun 2024 00:24:07 -0700 Subject: [PATCH 18/28] fix benchmark Signed-off-by: changwangss --- .../pytorch/text-generation/quantization/run_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index be12520c902..bf77c9ece9a 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -71,9 +71,11 @@ function run_benchmark { if [[ ${mode} == "accuracy" ]]; then mode_cmd=" --accuracy " extra_cmd=$extra_cmd" --tasks ${lm_eval_tasks}" + extra_cmd=$extra_cmd" --eval_batch_size ${batch_size}" elif [[ ${mode} == "benchmark" ]]; then mode_cmd=" --benchmark " extra_cmd=$extra_cmd" --benchmark_iters ${iters}" + extra_cmd=$extra_cmd" --benchmark_batch_size ${batch_size}" else echo "Error: No such mode: ${mode}" exit 1 @@ -242,13 +244,11 @@ function run_benchmark { if [ "${script}" == 
"run_generation_sq.py" ];then python -u ./${script} \ --model ${model_name_or_path} \ - --batch_size ${batch_size} \ ${mode_cmd} \ ${extra_cmd} elif [ "${script}" == "run_generation_cpu_woq.py" ];then python -u ./${script} \ --model ${model_name_or_path} \ - --eval_batch_size ${batch_size} \ ${mode_cmd} \ ${extra_cmd} else From 91c973b89e01a1d4893b1e0b0b143f8a695cacb8 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 13 Jun 2024 19:37:18 -0700 Subject: [PATCH 19/28] fix pylint Signed-off-by: changwangss --- .../neural_chat/models/model_utils.py | 2 +- .../transformers/llm/quantization/sq_utils.py | 5 +- .../transformers/llm/quantization/utils.py | 12 +- .../transformers/modeling/modeling_auto.py | 439 ++++++++++++------ .../transformers/utils/utility.py | 412 +--------------- 5 files changed, 306 insertions(+), 564 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py index dd0c2c99102..9c3e837c7d9 100644 --- a/intel_extension_for_transformers/neural_chat/models/model_utils.py +++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py @@ -699,7 +699,7 @@ def load_model( assert ipex.__version__ >= "2.1.0+cpu", "Please use Intel Extension for PyTorch >=2.1.0+cpu." if re.search("falcon", model_name, re.IGNORECASE): assert transformers.__version__ <= "4.33.3", "Please pip install transformers==4.33.3" - from intel_extension_for_transformers.transformers.llm.evaluation.models import TSModelCausalLMForITREX + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import TSModelCausalLMForITREX model = TSModelCausalLMForITREX.from_pretrained( model_name, file_name="best_model.pt" diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index 1ffb7b47001..634ea7499c6 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -14,12 +14,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import re + from typing import Optional, Tuple import transformers from datasets import load_dataset -from optimum.intel.generation.modeling import TSModelForCausalLM from torch.nn.functional import pad from torch.utils.data import DataLoader from transformers.modeling_outputs import CausalLMOutputWithPast @@ -315,7 +314,7 @@ def collate_batch(batch): ) return calib_dataloader - +from optimum.intel.generation.modeling import TSModelForCausalLM class TSModelCausalLMForITREX(TSModelForCausalLM): def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index afad1d516c2..4a24dc7121d 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -57,9 +57,7 @@ ) if is_autoround_available(): - from auto_round.export.export_to_itrex.model_wrapper import ( - WeightOnlyLinear as auto_round_woqlinear, - ) # pylint: disable=E0401 + from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woqlinear # pylint: disable=E0401 from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader torch = LazyImport("torch") @@ -299,10 +297,8 @@ def _replace_linear( use_optimum_format=use_optimum_format, ) elif device == "xpu" or device == torch.device("xpu"): - from intel_extension_for_pytorch.nn.utils._quantize_convert import ( - WeightOnlyQuantizedLinear as ipex_linear, - ) # pylint: disable=E0401 - + from intel_extension_for_pytorch.nn.utils._quantize_convert import \ WeightOnlyQuantizedLinear as ipex_linear # pylint: disable=E0401 model._modules[name] = ipex_linear( in_features, out_features, @@ -569,6 +565,8 @@ def convert_to_quantized_model(model, config, device="cpu"): ) model = prepare(model, quant_config) model = convert(model) + # The QBits module doesn't match the HQQ algorithm, so return the model directly.
+ return model elif config.quant_method.value == "awq": quant_config = AWQConfig( dtype=dtype, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 89ab4f758ea..28dc9715782 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -164,7 +164,11 @@ def build_woq_model(model, quantization_config): if "lm_head" in n or "output_layer" in n or "embed_out" in n: continue if isinstance(m, torch.nn.Linear): - zp = getattr(quantization_config, "zero_point", not getattr(quantization_config, "sym", False)) + zp = getattr( + quantization_config, + "zero_point", + not getattr(quantization_config, "sym", False), + ) with init_empty_weights(): new_module = WeightOnlyLinear( m.in_features, @@ -201,6 +205,7 @@ def convert_model_to_public(model): ]: model = recover_export_model(model) + def make_contiguous(model): for param in model.parameters(): if param.data.ndimension() > 1: @@ -225,7 +230,8 @@ def save_low_bit( self.model.config.quantization_config = self.quantization_config self.model.config.save_pretrained(save_directory) weights_file = os.path.join( - os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME + ) torch.save(self.quantized_state_dict(), weights_file) return @@ -239,25 +245,42 @@ def save_low_bit( ) if self.quantization_config.use_ipex: + def save_linear_parameters(model, save_directory): # only can save to pytorch model.bin due to ipex. weights_file = os.path.join( - os.path.abspath(os.path.expanduser(save_directory)), SAFE_WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(save_directory)), SAFE_WEIGHTS_NAME + ) os.remove(weights_file) weights_file = os.path.join( - os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(save_directory)), WEIGHTS_NAME + ) linear_parameters = {} - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_cpu_linear, + ) + for name, module in model.named_modules(): if isinstance(module, ipex_cpu_linear): - linear_parameters[name + ".ipex_scales"] = module._op_context.get_scales().contiguous() - linear_parameters[name + ".ipex_weight"] = \ - module._op_context.to_public(module._op_context.get_weight()).contiguous() - linear_parameters[name + ".ipex_zeros"] = module._op_context.get_zero_points().contiguous() + linear_parameters[name + ".ipex_scales"] = ( + module._op_context.get_scales().contiguous() + ) + linear_parameters[name + ".ipex_weight"] = ( + module._op_context.to_public( + module._op_context.get_weight() + ).contiguous() + ) + linear_parameters[name + ".ipex_zeros"] = ( + module._op_context.get_zero_points().contiguous() + ) if module._op_context.get_bias() is not None: - linear_parameters[name + ".ipex_bias"] = module._op_context.get_bias().contiguous() + linear_parameters[name + ".ipex_bias"] = ( + module._op_context.get_bias().contiguous() + ) if module._op_context.get_g_idx() is not None: - linear_parameters[name + ".ipex_g_idx"] = module._op_context.get_g_idx().contiguous() + linear_parameters[name + ".ipex_g_idx"] = ( + module._op_context.get_g_idx().contiguous() + ) others_parameters = model.state_dict() linear_parameters.update(others_parameters) @@ -346,17 +369,27 @@ 
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): use_vllm = kwargs.pop("use_vllm", None) if use_vllm is not None: logger.info("The backend is vLLM.") - from vllm import LLM # pylint: disable=E1101 - from vllm.model_executor.model_loader import get_model_loader # pylint: disable=E0611 - from vllm.model_executor.model_loader.weight_utils import default_weight_loader # pylint: disable=E0401 disable=E0611 - from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - ColumnParallelLinear, - RowParallelLinear) # pylint: disable=E1101 + from vllm import LLM # pylint: disable=E1101 + from vllm.model_executor.model_loader import ( + get_model_loader, + ) # pylint: disable=E0611 + from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + ) # pylint: disable=E0401 disable=E0611 + from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + ColumnParallelLinear, + RowParallelLinear, + ) # pylint: disable=E1101 os.environ["backend"] = "use_vllm" - llm = LLM(model=pretrained_model_name_or_path, trust_remote_code=True) # Create an vllm instance. - model = llm.llm_engine.model_executor.driver_worker.model_runner.model # pylint: disable=E1101 + llm = LLM( + model=pretrained_model_name_or_path, trust_remote_code=True + ) # Create an vllm instance. + model = ( + llm.llm_engine.model_executor.driver_worker.model_runner.model + ) # pylint: disable=E1101 print("Original model =", model) original_parameter_memo = dict() @@ -366,12 +399,22 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if "qkv_proj" in name or "gate_up_proj" in name: input_dim = getattr(params, "input_dim", None) output_dim = getattr(params, "output_dim", None) - original_parameter_memo[name] = (input_dim, output_dim, params.weight_loader) + original_parameter_memo[name] = ( + input_dim, + output_dim, + params.weight_loader, + ) class linear_adaptor(torch.nn.Linear): - def __init__(self, in_features: int, out_features: int, bias: bool = True, \ - device=None, dtype=None) -> None: + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: super().__init__(in_features, out_features, bias, device, dtype) def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: @@ -379,34 +422,49 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: for name, module in model.named_modules(): bias_flag = False - if isinstance(module, QKVParallelLinear) or isinstance(module, MergedColumnParallelLinear) or \ - isinstance(module, RowParallelLinear) or isinstance(module, ColumnParallelLinear): + if ( + isinstance(module, QKVParallelLinear) + or isinstance(module, MergedColumnParallelLinear) + or isinstance(module, RowParallelLinear) + or isinstance(module, ColumnParallelLinear) + ): out_feature = module.weight.shape[0] in_feature = module.weight.shape[1] if getattr(module, "bias", False) != None: bias_flag = True weight_dtype = module.weight.dtype - torch_linear = linear_adaptor(in_features=in_feature, - out_features=out_feature, - bias=bias_flag, - dtype=weight_dtype) + torch_linear = linear_adaptor( + in_features=in_feature, + out_features=out_feature, + bias=bias_flag, + dtype=weight_dtype, + ) module_traversal = model - all_module_names = name.split('.') + all_module_names = name.split(".") all_module_names_except_last = all_module_names[:-1] for sub_module_name in all_module_names_except_last: 
module_traversal = module_traversal._modules[sub_module_name] - module_traversal._modules[all_module_names[-1]] = copy.deepcopy(torch_linear) + module_traversal._modules[all_module_names[-1]] = copy.deepcopy( + torch_linear + ) print("Optimized model =", model) - loader = get_model_loader(llm.llm_engine.load_config) # pylint: disable=E1101 + loader = get_model_loader( + llm.llm_engine.load_config + ) # pylint: disable=E1101 + + weights_iterator = loader._get_weights_iterator( + llm.llm_engine.model_config.model, + llm.llm_engine.model_config.revision, + fall_back_to_pt=True, + ) - weights_iterator = loader._get_weights_iterator(llm.llm_engine.model_config.model, - llm.llm_engine.model_config.revision, - fall_back_to_pt=True) + from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + ) # pylint: disable=E0401 disable=E0611 - from vllm.model_executor.model_loader.weight_utils import default_weight_loader # pylint: disable=E0401 disable=E0611 params_dict = dict(model.named_parameters(remove_duplicate=False)) for name in params_dict.keys(): params = params_dict[name] @@ -424,11 +482,13 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: print("INC quantizing...") config = kwargs.pop("config", None) if config is None: - config = RtnConfig(compute_dtype="int8", - group_size=128, - scale_dtype="bf16", - weight_dtype="int4_clip", - bits=4) + config = RtnConfig( + compute_dtype="int8", + group_size=128, + scale_dtype="bf16", + weight_dtype="int4_clip", + bits=4, + ) print("using default RTNConfig = ", config) print("Using customized config = ", config) model = convert_to_quantized_model(model, config) @@ -489,8 +549,12 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: return model device_map = kwargs.get("device_map", "cpu") - use_cpu = True if device_map == torch.device("cpu") or device_map == "cpu" else False - use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False + use_cpu = ( + True if device_map == torch.device("cpu") or device_map == "cpu" else False + ) + use_xpu = ( + True if device_map == torch.device("xpu") or device_map == "xpu" else False + ) config = kwargs.pop("config", None) model_hub = kwargs.pop("model_hub", "huggingface") @@ -498,20 +562,28 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: quantization_config = kwargs.pop("quantization_config", None) if not isinstance(config, PretrainedConfig): if model_hub == "modelscope": - import modelscope # pylint: disable=E0401 - config = modelscope.AutoConfig.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=True) + import modelscope # pylint: disable=E0401 + + config = modelscope.AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=True + ) else: config, _ = AutoConfig.from_pretrained( pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs, - ) - if quantization_config is not None and quantization_config.quant_method in ["sq"]: + if quantization_config is not None and quantization_config.quant_method in [ + "sq" + ]: use_neural_speed = False - elif hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and "quant_method" in config.quantization_config and config.quantization_config["quant_method"] in ["sq"]: + elif ( + hasattr(config, "quantization_config") + and isinstance(config.quantization_config, dict) + and "quant_method" in config.quantization_config + and config.quantization_config["quant_method"] in ["sq"] + ): 
use_neural_speed = False elif kwargs.get("use_llm_runtime", None) is not None: use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu @@ -544,30 +616,38 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: "Quantization_config loading failed. If you want to load saved " "low bit model, please check your quantizate_config.json." ) - elif use_neural_speed and not config.quantization_config["quant_method"] in ["dynamic", "static", "qat"]: + elif use_neural_speed and not config.quantization_config[ + "quant_method" + ] in ["dynamic", "static", "qat"]: if not os.path.exists(pretrained_model_name_or_path): from huggingface_hub import snapshot_download - pretrained_model_name_or_path = snapshot_download(repo_id=pretrained_model_name_or_path, - allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"], - ) + + pretrained_model_name_or_path = snapshot_download( + repo_id=pretrained_model_name_or_path, + allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"], + ) if quantization_config is None: - ConfigInit = {"rtn": RtnConfig, - "awq": AwqConfig, - "teq": TeqConfig, - "gptq": GPTQConfig, - "autoround": AutoRoundConfig, - } + ConfigInit = { + "rtn": RtnConfig, + "awq": AwqConfig, + "teq": TeqConfig, + "gptq": GPTQConfig, + "autoround": AutoRoundConfig, + } quantization_config = config.quantization_config - assert quantization_config.get("quant_method", None) in ConfigInit, \ - "Detect this model is not a low-bit model." - quantization_config = ConfigInit[quantization_config["quant_method"]].from_dict(quantization_config) + assert ( + quantization_config.get("quant_method", None) in ConfigInit + ), "Detect this model is not a low-bit model." + quantization_config = ConfigInit[ + quantization_config["quant_method"] + ].from_dict(quantization_config) logger.info("Loading Low Bits model by Neural Speed.") quantization_config.post_init_runtime() from neural_speed import Model model = Model() - model.init( # pylint: disable=E1123 + model.init( # pylint: disable=E1123 pretrained_model_name_or_path, weight_dtype=quantization_config.weight_dtype, alg=quantization_config.scheme, @@ -658,9 +738,15 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: else: quantization_config = RtnConfig( bits=4, - compute_dtype=torch.float32 if - (use_cpu and not CpuInfo().bf16 - and torch_dtype == torch.bfloat16) else convert_dtype_torch2str(torch_dtype), + compute_dtype=( + torch.float32 + if ( + use_cpu + and not CpuInfo().bf16 + and torch_dtype == torch.bfloat16 + ) + else convert_dtype_torch2str(torch_dtype) + ), weight_dtype="nf4" if use_cpu else "int4_fullrange", ) else: @@ -674,14 +760,21 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: if quantization_config is None: if use_neural_speed: quantization_config = RtnConfig( - compute_dtype="bf16" if CpuInfo().bf16 else "fp32", weight_dtype="int8" + compute_dtype="bf16" if CpuInfo().bf16 else "fp32", + weight_dtype="int8", ) else: quantization_config = RtnConfig( bits=8, - compute_dtype=torch.float32 if - (use_cpu and not CpuInfo().bf16 - and torch_dtype == torch.bfloat16) else convert_dtype_torch2str(torch_dtype), + compute_dtype=( + torch.float32 + if ( + use_cpu + and not CpuInfo().bf16 + and torch_dtype == torch.bfloat16 + ) + else convert_dtype_torch2str(torch_dtype) + ), weight_dtype="int8", ) else: @@ -731,7 +824,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: from neural_speed import Model model = Model() - model.init( # pylint: disable=E1123 + 
model.init( # pylint: disable=E1123 pretrained_model_name_or_path, weight_dtype=quantization_config.weight_dtype, alg=quantization_config.scheme, @@ -990,7 +1083,6 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: # torch.tensor(last_ind), # ) - # tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) # tokenized_dataset.set_format(type="torch", columns=["input_ids"]) # calib_dataloader = DataLoader( @@ -1014,7 +1106,6 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: # ) # calib_func = calib_func - # # call inc static quant # from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare # quant_config = StaticQuantConfig( @@ -1130,7 +1221,6 @@ def collate_batch(batch): torch.tensor(last_ind), ) - tokenized_dataset = train_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) train_dataloader = DataLoader( @@ -1157,7 +1247,7 @@ def train_func(model): optimizer.zero_grad() loss.backward() optimizer.step() - print('Iteration [{}], Loss: {:.4f}'.format(i+1, loss)) + print("Iteration [{}], Loss: {:.4f}".format(i + 1, loss)) return model logger.info( @@ -1170,6 +1260,7 @@ def train_func(model): # call inc static quant from neural_compressor import QuantizationAwareTrainingConfig, quantization from neural_compressor.training import prepare_compression + conf = QuantizationAwareTrainingConfig( backend=quantization_config.backend, excluded_precisions=quantization_config.excluded_precisions, @@ -1181,7 +1272,9 @@ def train_func(model): model = compression_manager.model train_func(model) compression_manager.callbacks.on_train_end() - compression_manager.model.save_pretrained = types.MethodType(save_low_bit, model) + compression_manager.model.save_pretrained = types.MethodType( + save_low_bit, model + ) quantization_config.remove_redundant_parameters() compression_manager.model.quantization_config = quantization_config logger.info("Quant Aware Training done.") @@ -1192,7 +1285,7 @@ def train_func(model): from neural_speed import Model model = Model() - model.init( # pylint: disable=E1123 + model.init( # pylint: disable=E1123 pretrained_model_name_or_path, weight_dtype="fp32", use_quant=False, @@ -1273,7 +1366,11 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): kwarg_attn_imp = kwargs.pop("attn_implementation", None) # lm-eval device map is dictionary - device_map = device_map[""] if isinstance(device_map, dict) and "" in device_map else device_map + device_map = ( + device_map[""] + if isinstance(device_map, dict) and "" in device_map + else device_map + ) if use_safetensors is None and not is_safetensors_available(): use_safetensors = False @@ -1289,8 +1386,12 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) token = use_auth_token - use_cpu = True if device_map == torch.device("cpu") or device_map == "cpu" else False - use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False + use_cpu = ( + True if device_map == torch.device("cpu") or device_map == "cpu" else False + ) + use_xpu = ( + True if device_map == torch.device("xpu") or device_map == "xpu" else False + ) user_agent = { "file_type": "model", @@ -1321,7 +1422,9 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif quantization_config["quant_method"] == "dynamic": quantization_config = DynamicQuantConfig.from_dict(quantization_config) elif quantization_config["quant_method"] == "qat": 
- quantization_config = QuantAwareTrainingConfig.from_dict(quantization_config) + quantization_config = QuantAwareTrainingConfig.from_dict( + quantization_config + ) elif quantization_config["quant_method"] == "sq": quantization_config = SmoothQuantConfig.from_dict(quantization_config) assert ( @@ -1462,11 +1565,15 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): "_raise_exceptions_for_missing_entries": False, "_commit_hash": commit_hash, } - resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None # result when internet is up, the repo and revision exist, but the file does not. - if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + if resolved_archive_file is None and filename == _add_variant( + SAFE_WEIGHTS_NAME, variant + ): # Maybe the checkpoint is sharded, we try to grab the index name in this case. resolved_archive_file = cached_file( pretrained_model_name_or_path, @@ -1487,9 +1594,13 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # This repo has no safetensors file of any kind, we switch to PyTorch. filename = _add_variant(WEIGHTS_NAME, variant) resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs + pretrained_model_name_or_path, + filename, + **cached_file_kwargs, ) - if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant): + if resolved_archive_file is None and filename == _add_variant( + WEIGHTS_NAME, variant + ): # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
resolved_archive_file = cached_file( pretrained_model_name_or_path, @@ -1508,7 +1619,9 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): "token": token, } if variant is not None and has_file( - pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs + pretrained_model_name_or_path, + WEIGHTS_NAME, + **has_file_kwargs, ): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named" @@ -1571,8 +1684,11 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if quantization_config.quant_method in ["static", "dynamic", "qat"]: model = model_class(config, *model_args, **kwargs) from neural_compressor.utils.pytorch import load + weights_file = os.path.join( - os.path.abspath(os.path.expanduser(pretrained_model_name_or_path)), WEIGHTS_NAME) + os.path.abspath(os.path.expanduser(pretrained_model_name_or_path)), + WEIGHTS_NAME, + ) q_model = load(weights_file, model, dataloader=None) del model return q_model @@ -1581,7 +1697,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import ( TSModelCausalLMForITREX, ) - q_model = torch.jit.load(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)) + + q_model = torch.jit.load( + os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + ) origin_model_type = config.model_type if origin_model_type in ["chatglm", "qwen", "baichuan"]: config.model_type = "qwen2" @@ -1611,19 +1730,25 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): dtype_orig = model_class._set_default_torch_dtype(torch_dtype) if quantization_config.compute_dtype is None: if use_xpu: - quantization_config.compute_dtype = \ - "fp16" if (torch_dtype is None or - torch_dtype == torch.bfloat16) \ + quantization_config.compute_dtype = ( + "fp16" + if (torch_dtype is None or torch_dtype == torch.bfloat16) else convert_dtype_torch2str(torch_dtype) + ) else: - quantization_config.compute_dtype = \ - "fp32" if (torch_dtype is None or - (not CpuInfo().bf16 and torch_dtype == torch.bfloat16) or - (torch_dtype == torch.float16)) \ + quantization_config.compute_dtype = ( + "fp32" + if ( + torch_dtype is None + or (not CpuInfo().bf16 and torch_dtype == torch.bfloat16) + or (torch_dtype == torch.float16) + ) else convert_dtype_torch2str(torch_dtype) + ) else: - if ((not CpuInfo().bf16 and quantization_config.compute_dtype == "bf16") - or (use_cpu and quantization_config.compute_dtype == "fp16")): + if (not CpuInfo().bf16 and quantization_config.compute_dtype == "bf16") or ( + use_cpu and quantization_config.compute_dtype == "fp16" + ): quantization_config.compute_dtype = "fp32" if quantization_config.scale_dtype is None: @@ -1631,7 +1756,9 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if quantization_config.scale_dtype not in ["fp32", "fp16", "bf16"]: logger.warning("scale_dtype only supports fp32, bf16, fp16.") quantization_config.scale_dtype = "fp32" - logger.warning("fp32 scale_dtype is used, please change the config.json if you don't want to use it.") + logger.warning( + "fp32 scale_dtype is used, please change the config.json if you don't want to use it." + ) # weight dtype is higher priority than bits in config.json when both existed. 
if quantization_config.weight_dtype is None: @@ -1639,36 +1766,47 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): quantization_config.weight_dtype = "int4_clip" logger.info( "{} quantization weight_dtype is used due to bits is 4 in config.json.".format( - quantization_config.weight_dtype) + quantization_config.weight_dtype ) + ) elif quantization_config.bits == 8: quantization_config.weight_dtype = "int8" logger.info( "{} quantization weight_dtype is used due to bits is 8 in config.json.".format( - quantization_config.weight_dtype) + quantization_config.weight_dtype ) + ) else: logger.warning("bits number only supports 4, 8.") quantization_config.weight_dtype = "int4_clip" logger.warning( - "int4_clip weight_dtype is used, please change the config.json if you don't want to use it.") + "int4_clip weight_dtype is used, please change the config.json if you don't want to use it." + ) else: - if quantization_config.weight_dtype not in ["int4_fullrange", - "int4_clip", - "int8", - "fp8_e5m2", - "fp8_e4m3", - "nf4", - "fp4_e2m1_bnb", - "fp4_e2m1"]: - logger.warning("Please provide the correct bits number or weight_dtype in config.json.") + if quantization_config.weight_dtype not in [ + "int4_fullrange", + "int4_clip", + "int8", + "fp8_e5m2", + "fp8_e4m3", + "nf4", + "fp4_e2m1_bnb", + "fp4_e2m1", + ]: + logger.warning( + "Please provide the correct bits number or weight_dtype in config.json." + ) raise ValueError( f"weight_dtype must be a string in " f"'int8', 'int4', 'int4_fullrange', 'int4_clip', 'nf4', " f"'fp4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8', 'fp8_e5m2, fp8_e4m3'" ) else: - logger.info("{} quantization weight_dtype is used.".format(quantization_config.weight_dtype)) + logger.info( + "{} quantization weight_dtype is used.".format( + quantization_config.weight_dtype + ) + ) init_contexts = [no_init_weights(_enable=_fast_init)] init_contexts.append(init_empty_weights()) @@ -1706,7 +1844,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if is_ipex_available() and quantization_config.use_ipex: import intel_extension_for_pytorch as ipex - from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_linear + from intel_extension_for_pytorch.nn.modules import ( + WeightOnlyQuantizedLinear as ipex_linear, + ) + def replace_ipex_cpu_woq_linear(model, current_name=[]): for name, module in model.named_children(): current_name.append(name) @@ -1716,37 +1857,46 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): 8: ipex.quantization.WoqWeightDtype.INT8, } compute_dtype = { - "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. + "fp32": ipex.quantization.WoqLowpMode.NONE, # follow the activation datatype. 
"bf16": ipex.quantization.WoqLowpMode.BF16, "fp16": ipex.quantization.WoqLowpMode.FP16, "int8": ipex.quantization.WoqLowpMode.INT8, - } - ipex_qconfig_mapping = ( - ipex.quantization.get_weight_only_quant_qconfig_mapping( - weight_dtype=weight_dtype[quantization_config.bits], - lowp_mode=compute_dtype[quantization_config.compute_dtype], - act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, - group_size=quantization_config.group_size, - ) + ipex_qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype[quantization_config.bits], + lowp_mode=compute_dtype[quantization_config.compute_dtype], + act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + group_size=quantization_config.group_size, ) tmp_linear = torch.nn.Linear( module.in_features, module.out_features, - True if hasattr(module, "bias") else False - ) + True if hasattr(module, "bias") else False, + ) tmp_linear.qconfig = ipex_qconfig_mapping.global_qconfig target_linear = ipex_linear.from_float_and_int4_weight( - mod = tmp_linear, - qweight = state_dict.pop('.'.join(current_name) + ".ipex_weight"), - scales = state_dict.pop('.'.join(current_name) + ".ipex_scales"), - zero_points = state_dict.pop('.'.join(current_name) + ".ipex_zeros"), - bias = state_dict.pop('.'.join(current_name) + ".ipex_bias") \ - if '.'.join(current_name) + ".ipex_bias" in state_dict else None, - group_size = quantization_config.group_size, - g_idx = state_dict.pop('.'.join(current_name) + ".ipex_g_idx") \ - if '.'.join(current_name) + ".ipex_g_idx" in state_dict else None, + mod=tmp_linear, + qweight=state_dict.pop( + ".".join(current_name) + ".ipex_weight" + ), + scales=state_dict.pop( + ".".join(current_name) + ".ipex_scales" + ), + zero_points=state_dict.pop( + ".".join(current_name) + ".ipex_zeros" + ), + bias=( + state_dict.pop(".".join(current_name) + ".ipex_bias") + if ".".join(current_name) + ".ipex_bias" in state_dict + else None + ), + group_size=quantization_config.group_size, + g_idx=( + state_dict.pop(".".join(current_name) + ".ipex_g_idx") + if ".".join(current_name) + ".ipex_g_idx" in state_dict + else None + ), ) setattr(model, name, target_linear) else: @@ -1783,14 +1933,18 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): # Set model in evaluation mode to deactivate DropOut modules by default model.eval() - if quantization_config.weight_dtype not in [ - "fp8_e5m2", - "fp8_e4m3", - "nf4", - "fp4_e2m1", - "fp4_e2m1_bnb", - "int4_fullrange", - ] and not quantization_config.use_ipex: + if ( + quantization_config.weight_dtype + not in [ + "fp8_e5m2", + "fp8_e4m3", + "nf4", + "fp4_e2m1", + "fp4_e2m1_bnb", + "int4_fullrange", + ] + and not quantization_config.use_ipex + ): model = replace_linear( model, quantization_config=quantization_config, @@ -1798,8 +1952,9 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): empty_weights=True, ) - if (not use_xpu and torch_dtype == torch.float16) or (not use_xpu and not CpuInfo().bf16 - and torch_dtype == torch.bfloat16): + if (not use_xpu and torch_dtype == torch.float16) or ( + not use_xpu and not CpuInfo().bf16 and torch_dtype == torch.bfloat16 + ): model.to(dtype=torch.float32) # If it is a model with generation capabilities, attempt to load the generation config diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 2467531fab2..092a3a33a58 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ 
b/intel_extension_for_transformers/transformers/utils/utility.py @@ -18,9 +18,7 @@ import argparse import os -from typing import Optional, Tuple -from neural_compressor.utils import logger -from neural_compressor.utils.utility import LazyImport, CpuInfo +from neural_compressor.utils.utility import LazyImport from intel_extension_for_transformers.tools.utils import is_ipex_available @@ -96,411 +94,3 @@ def __init__(self) -> None: self.dataset = dataloader.dataset return INCDataLoader() - - -def generate_dummy_past_key_values(config, input_bs): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 0, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 0, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 0, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - if config.model_type == "bloom": - shape_key = (input_bs * num_attention_heads, d_k, 1) - shape_value = (input_bs * num_attention_heads, 1, d_k) - key = torch.ones(size=shape_key) - value = torch.ones(size=shape_value) - past_key_values = tuple( - tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) - for _ in range(num_layers) - ) - return past_key_values - elif config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - return past_key_values - elif config.model_type == "falcon": - new_shape = [input_bs, 1, 0, d_k] - else: - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -def generate_dummy_past_key_values_for_inference(config, input_bs): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 0, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 0, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 0, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = 
NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - if config.model_type == "bloom": - shape_key = (input_bs * num_attention_heads, d_k, 0) - shape_value = (input_bs * num_attention_heads, 0, d_k) - key = torch.empty(size=shape_key) - value = torch.empty(size=shape_value) - past_key_values = tuple( - tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) - for _ in range(num_layers) - ) - return past_key_values - elif config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - return past_key_values - elif config.model_type == "falcon": - new_shape = [input_bs, 1, 0, d_k] - else: - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 1, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 1, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 1, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - nb_pkv = 2 - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - if config.model_type == "bloom": - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_key_value_heads, d_k, 1] - else: - new_shape = [input_bs * num_key_value_heads, 1, d_k] - - else: - new_shape = [input_bs, num_key_value_heads, 1, d_k] - - beam_idx_tmp = torch.zeros( - (2048, int(input_bs * num_beams)), dtype=torch.long - ).contiguous() - past_key_values = [ - ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - beam_idx_tmp, - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -IPEX_OPT_LLM_SUPPORTED_DICT = { - "2.2": ["gptj", "opt", "llama", "falcon", 
"chatglm", "baichuan", "gpt-neox"], - "2.3": [ - "gptj", - "opt", - "llama", - "falcon", - "chatglm", - "baichuan", - "qwen", - "bloom", - "codegen", - "gptbigcode", - "t5", - "mixtral", - "mpt", - ], -} - -MODEL_TYPES_REQUIRING_POSITION_IDS = { - "codegen", - "gpt2", - "gpt-bigcode", - "gpt-neo", - "gpt-neox", - "gptj", - "imagegpt", - "llama", - "mistral", - "chatglm", -} - -if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": - logger.info( - "ipex.llm.optimize by 2.2.0 version supported model family: {}".format( - ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]) - ) - ) - logger.info( - "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version." - ) - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] -elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": - logger.info( - "ipex.llm.optimize by 2.3.0 version supported model family: {}".format( - ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]) - ) - ) - logger.info( - "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version." - ) - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] -else: - logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] - -def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): - """Generate the dummy example inputs.""" - prompt = "Welcome to use Intel Extension for Transformers." - prompt = [prompt] * batch_size - input_ids = tokenizer(prompt, return_tensors="pt").input_ids - model_type = model_config.model_type.replace("_", "-") - if model_type in IPEX_OPT_LLM_SUPPORTED: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=model_config, - input_bs=batch_size, - num_beams=num_beams - ) - else: - past_key_values = generate_dummy_past_key_values(config=model_config, input_bs=batch_size) - - input_ids = input_ids[:, :512] - if model_type in ["bloom", "qwen"]: - attention_mask = torch.ones(input_ids.shape[0], input_ids.shape[1] + 1) - attention_mask[:,0] = 0 - else: - attention_mask = torch.ones(input_ids.shape) - position_ids = torch.arange(input_ids.shape[1]).repeat(batch_size, 1) - - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - example_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values - } - else: - example_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values - } - return example_inputs - - -def make_torchscript_model(model, json_file_path, example_inputs): - """Recover ipex model from JSON file. - - Args: - model (object): fp32 model need to do quantization. - json_file_path (json): configuration JSON file for ipex. - example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. 
- - Returns: - (object): quantized model - """ - - ipex = LazyImport("intel_extension_for_pytorch") - from torch.ao.quantization.observer import MinMaxObserver - - if ipex.__version__ >= "2.1.100": - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver) - else: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver()) - if isinstance(example_inputs, dict): - model = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=example_inputs, inplace=True) - else: - model = ipex.quantization.prepare(model, qconfig, example_inputs=example_inputs, inplace=True) - model.load_qconf_summary(qconf_summary=json_file_path) - model = ipex.quantization.convert(model, inplace=True) - model.eval() - with torch.no_grad(): - try: - if isinstance(example_inputs, dict): - # pylint: disable=E1120,E1123 - model = torch.jit.trace(model, example_kwarg_inputs=example_inputs) - else: - model = torch.jit.trace(model, example_inputs) - model = torch.jit.freeze(model.eval()) - except: - if isinstance(example_inputs, dict): - # pylint: disable=E1120,E1123 - model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False, check_trace=False) - else: - model = torch.jit.trace(model, example_inputs, strict=False) - model = torch.jit.freeze(model.eval()) - if isinstance(example_inputs, dict): - model(**example_inputs) - model(**example_inputs) - elif isinstance(example_inputs, tuple) or isinstance(example_inputs, list): - model(*example_inputs) - model(*example_inputs) - else: - model(example_inputs) - model(example_inputs) - return model - -def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remote_code=False): - """Recover ipex model from JSON file. - - Args: - model (object): fp32 model need to do quantization. - json_file_path (json): configuration JSON file for ipex. - trust_remote_code (bool): trust remote code. - - Returns: - (object): quantized model - """ - from transformers import AutoModelForCausalLM - - # ipex recovered int8 model from configure.json requests float32 model input and on cpu device. 
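
The two helpers here (make_torchscript_model and recover_model_from_json) are easiest to read next to a call site. A hedged usage sketch follows; the model id, JSON path, and import location are placeholders, since the helper moves during this series and later patches save the configuration under a different file name (qconfig.json).

from transformers import AutoTokenizer
# assumption: adjust this import to wherever recover_model_from_json ends up after the refactor
from intel_extension_for_transformers.transformers.utils.utility import recover_model_from_json

fp32_model_id = "facebook/opt-125m"               # hypothetical FP32 base model
json_path = "saved_results/best_configure.json"   # hypothetical SmoothQuant config dump

# Rebuilds the traced INT8 TorchScript model from the FP32 checkpoint plus the JSON recipe.
int8_model = recover_model_from_json(fp32_model_id, json_path, trust_remote_code=False)
tokenizer = AutoTokenizer.from_pretrained(fp32_model_id)
inputs = tokenizer("Welcome to use Intel Extension for Transformers.", return_tensors="pt")
# The returned TSModelCausalLMForITREX wrapper is expected to behave like a transformers causal LM.
outputs = int8_model.generate(inputs.input_ids, max_new_tokens=16)
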
- user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, - trust_remote_code=trust_remote_code).float() - if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED: - import intel_extension_for_pytorch as ipex - qconfig = ipex.quantization.default_static_qconfig_mapping - user_model = ipex.optimize_transformers( - user_model.eval(), - dtype=torch.float, - inplace=True, - quantization_config=qconfig, - deployment_mode=False, - ) - - # tokenizer - if user_model.config.model_type == "llama": - from transformers import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained(user_model.config.name_or_path) - else: - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained( - user_model.config.name_or_path, trust_remote_code=trust_remote_code - ) - - # example_inputs - example_inputs = get_example_inputs(user_model.config, tokenizer=tokenizer) - - # pylint: disable=E0611 - user_model.config.torchscript = True - config = user_model.config - user_model = make_torchscript_model(user_model, json_file_path, example_inputs) - import intel_extension_for_pytorch as ipex - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, - ) - origin_model_type = config.model_type - if origin_model_type in ["chatglm", "qwen", "baichuan"]: - config.model_type = "qwen2" - user_model = TSModelCausalLMForITREX(user_model, config=config) - user_model.config.model_type = origin_model_type - return user_model From 27148ebe796fc401d2a9fadcc8281fc9dfa5ec74 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 13 Jun 2024 20:01:42 -0700 Subject: [PATCH 20/28] fix pylint Signed-off-by: changwangss --- .../neural_chat/models/model_utils.py | 3 ++- .../transformers/modeling/modeling_auto.py | 7 ++----- .../transformers/utils/utility.py | 3 ++- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py index 9c3e837c7d9..6bc2d71d7c7 100644 --- a/intel_extension_for_transformers/neural_chat/models/model_utils.py +++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py @@ -699,7 +699,8 @@ def load_model( assert ipex.__version__ >= "2.1.0+cpu", "Please use Intel Extension for PyTorch >=2.1.0+cpu." 
if re.search("falcon", model_name, re.IGNORECASE): assert transformers.__version__ <= "4.33.3", "Please pip install transformers==4.33.3" - from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import TSModelCausalLMForITREX + from intel_extension_for_transformers.transformers.llm.quantization.sq_utils import \ + TSModelCausalLMForITREX model = TSModelCausalLMForITREX.from_pretrained( model_name, file_name="best_model.pt" diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 28dc9715782..93dcaf00509 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -56,10 +56,6 @@ ) from ..utils.utility import ( CpuInfo, - generate_dummy_past_key_values, - generate_dummy_past_key_values_for_opt_llm, - MODEL_TYPES_REQUIRING_POSITION_IDS, - IPEX_OPT_LLM_SUPPORTED, WEIGHTS_NAME, WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, @@ -523,7 +519,8 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: if model_type not in cls.model_type_list: logger.error( - "Can't support this model_type. Please set the correct model_type, supported model_type: {}".format( + "Can't support this model_type." + + "Please set the correct model_type, supported model_type: {}".format( cls.model_type_list ) ) diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 092a3a33a58..527d8b097ff 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -18,7 +18,8 @@ import argparse import os -from neural_compressor.utils.utility import LazyImport +from neural_compressor.utils import logger +from neural_compressor.utils.utility import LazyImport, CpuInfo from intel_extension_for_transformers.tools.utils import is_ipex_available From a375b6fcf9df327810a0db182a5e62f08145d8e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 03:02:26 +0000 Subject: [PATCH 21/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/modeling/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 93dcaf00509..61f59a47dd6 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -519,7 +519,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: if model_type not in cls.model_type_list: logger.error( - "Can't support this model_type." + + "Can't support this model_type." 
+ "Please set the correct model_type, supported model_type: {}".format( cls.model_type_list ) From 6b429c06d1c0334303ffbe65eb5648869587461f Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Fri, 14 Jun 2024 11:29:30 +0800 Subject: [PATCH 22/28] install requirements_pt.txt of inc3.x Signed-off-by: Sun, Xuehao --- .github/workflows/script/unitTest/env_setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/script/unitTest/env_setup.sh b/.github/workflows/script/unitTest/env_setup.sh index 838e3a4d98d..4afbf606c32 100644 --- a/.github/workflows/script/unitTest/env_setup.sh +++ b/.github/workflows/script/unitTest/env_setup.sh @@ -13,6 +13,7 @@ until [ "$n" -ge 5 ]; do git clone https://github.com/intel/neural-compressor.git /neural-compressor cd /neural-compressor pip install -r requirements.txt + pip install -r requirements_pt.txt python setup.py install && break n=$((n + 1)) sleep 5 From ed478c18f53b990d595bd3c5754ee2593ab84155 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 3 Jul 2024 02:10:30 -0700 Subject: [PATCH 23/28] rebase Signed-off-by: changwangss --- .../transformers/utils/utility.py | 407 ------------------ 1 file changed, 407 deletions(-) diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index 5f353b296ed..527d8b097ff 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -95,410 +95,3 @@ def __init__(self) -> None: self.dataset = dataloader.dataset return INCDataLoader() -<<<<<<< HEAD -======= - - -def generate_dummy_past_key_values(config, input_bs): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 0, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 0, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 0, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - if config.model_type == "bloom": - shape_key = (input_bs * num_attention_heads, d_k, 1) - shape_value = (input_bs * num_attention_heads, 1, d_k) - key = torch.ones(size=shape_key) - value = torch.ones(size=shape_value) - past_key_values = tuple( - tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) - for _ in range(num_layers) - ) - return past_key_values - elif config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - return 
past_key_values - elif config.model_type == "falcon": - new_shape = [input_bs, 1, 0, d_k] - else: - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -def generate_dummy_past_key_values_for_inference(config, input_bs): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 0, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 0, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 0, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - - if config.model_type == "bloom": - shape_key = (input_bs * num_attention_heads, d_k, 0) - shape_value = (input_bs * num_attention_heads, 0, d_k) - key = torch.empty(size=shape_key) - value = torch.empty(size=shape_value) - past_key_values = tuple( - tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) - for _ in range(num_layers) - ) - return past_key_values - elif config.model_type == "gpt_bigcode": - new_shape = [input_bs, 0, d_k * 2] - dummy_tensor = torch.zeros(size=new_shape) - past_key_values = tuple([dummy_tensor] * num_layers) - return past_key_values - elif config.model_type == "falcon": - new_shape = [input_bs, 1, 0, d_k] - else: - new_shape = [input_bs, num_key_value_heads, 0, d_k] - past_key_values = [ - ( - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1): - """Generate the dummy past_key_values.""" - from optimum.utils import NormalizedConfigManager - if config.model_type == "qwen": - new_shape = [ - input_bs, - 1, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "baichuan": - new_shape = [ - input_bs, - config.num_attention_heads, - 1, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_hidden_layers - elif config.model_type == "chatglm": - new_shape = [ - 1, - input_bs, - config.num_attention_heads, - config.hidden_size // config.num_attention_heads, - ] - num_layers = config.num_layers - else: - normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.model_type - )(config) - num_layers = normalized_config.num_layers - num_attention_heads = 
normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - num_key_value_heads = num_attention_heads - nb_pkv = 2 - if hasattr(normalized_config, "num_key_value_heads"): - num_key_value_heads = normalized_config.num_key_value_heads - if hasattr(normalized_config, "multi_query_group_num"): - num_key_value_heads = normalized_config.multi_query_group_num - if config.model_type == "bloom": - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_key_value_heads, d_k, 1] - else: - new_shape = [input_bs * num_key_value_heads, 1, d_k] - - else: - new_shape = [input_bs, num_key_value_heads, 1, d_k] - - beam_idx_tmp = torch.zeros( - (2048, int(input_bs * num_beams)), dtype=torch.long - ).contiguous() - past_key_values = [ - ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros(size=new_shape).contiguous(), - torch.zeros(size=new_shape).contiguous(), - beam_idx_tmp, - ) - for _ in range(num_layers) - ] - return tuple(past_key_values) - -IPEX_OPT_LLM_SUPPORTED_DICT = { - "2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"], - "2.3": [ - "gptj", - "opt", - "llama", - "falcon", - "chatglm", - "baichuan", - "qwen", - "bloom", - "codegen", - "gptbigcode", - "t5", - "mixtral", - "mpt", - ], -} - -MODEL_TYPES_REQUIRING_POSITION_IDS = { - "codegen", - "gpt2", - "gpt-bigcode", - "gpt-neo", - "gpt-neox", - "gptj", - "imagegpt", - "llama", - "mistral", - "chatglm", -} - -if is_ipex_available() and ipex.__version__ == "2.2.0+cpu": - logger.info( - "ipex.llm.optimize by 2.2.0 version supported model family: {}".format( - ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]) - ) - ) - logger.info( - "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version." - ) - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"] -elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu": - logger.info( - "ipex.llm.optimize by 2.3.0 version supported model family: {}".format( - ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]) - ) - ) - logger.info( - "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version." - ) - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] -else: - logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.") - IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"] - -def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4): - """Generate the dummy example inputs.""" - prompt = "Welcome to use Intel Extension for Transformers." 
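
All of the generate_dummy_past_key_values* variants above reduce to the same idea: hand the traced graph an empty per-layer (key, value) cache with the right head layout. A condensed sketch for the common decoder layout is below; it assumes the standard transformers config attribute names (num_hidden_layers, num_attention_heads, hidden_size, optional num_key_value_heads) rather than the per-model special cases handled in the removed code.

import torch

def dummy_past_key_values(config, input_bs: int):
    """Build an empty (seq_len == 0) KV cache tuple for tracing a decoder-only model."""
    num_layers = config.num_hidden_layers
    num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
    head_dim = config.hidden_size // config.num_attention_heads
    shape = (input_bs, num_kv_heads, 0, head_dim)
    return tuple(
        (torch.zeros(shape).contiguous(), torch.zeros(shape).contiguous())
        for _ in range(num_layers)
    )
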
- prompt = [prompt] * batch_size - input_ids = tokenizer(prompt, return_tensors="pt").input_ids - model_type = model_config.model_type.replace("_", "-") - if model_type in IPEX_OPT_LLM_SUPPORTED: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=model_config, - input_bs=batch_size, - num_beams=num_beams - ) - else: - past_key_values = generate_dummy_past_key_values(config=model_config, input_bs=batch_size) - - input_ids = input_ids[:, :512] - attention_mask = torch.ones(input_ids.shape) - position_ids = torch.arange(input_ids.shape[1]).repeat(batch_size, 1) - - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - example_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values - } - else: - example_inputs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values - } - return example_inputs - - -def make_torchscript_model(model, json_file_path, example_inputs): - """Recover ipex model from JSON file. - - Args: - model (object): fp32 model need to do quantization. - json_file_path (json): configuration JSON file for ipex. - example_inputs (tuple or torch.Tensor or dict): example inputs that will be passed to the ipex function. - - Returns: - (object): quantized model - """ - - ipex = LazyImport("intel_extension_for_pytorch") - from torch.ao.quantization.observer import MinMaxObserver - - if ipex.__version__ >= "2.1.100": - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver) - else: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5, act_observer=MinMaxObserver()) - if isinstance(example_inputs, dict): - model = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=example_inputs, inplace=True) - else: - model = ipex.quantization.prepare(model, qconfig, example_inputs=example_inputs, inplace=True) - model.load_qconf_summary(qconf_summary=json_file_path) - model = ipex.quantization.convert(model, inplace=True) - model.eval() - with torch.no_grad(): - try: - if isinstance(example_inputs, dict): - # pylint: disable=E1120,E1123 - model = torch.jit.trace(model, example_kwarg_inputs=example_inputs) - else: - model = torch.jit.trace(model, example_inputs) - model = torch.jit.freeze(model.eval()) - except: - if isinstance(example_inputs, dict): - # pylint: disable=E1120,E1123 - model = torch.jit.trace(model, example_kwarg_inputs=example_inputs, strict=False, check_trace=False) - else: - model = torch.jit.trace(model, example_inputs, strict=False) - model = torch.jit.freeze(model.eval()) - if isinstance(example_inputs, dict): - model(**example_inputs) - model(**example_inputs) - elif isinstance(example_inputs, tuple) or isinstance(example_inputs, list): - model(*example_inputs) - model(*example_inputs) - else: - model(example_inputs) - model(example_inputs) - return model - -def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remote_code=False): - """Recover ipex model from JSON file. - - Args: - model (object): fp32 model need to do quantization. - json_file_path (json): configuration JSON file for ipex. - trust_remote_code (bool): trust remote code. - - Returns: - (object): quantized model - """ - from transformers import AutoModelForCausalLM - - # ipex recovered int8 model from configure.json requests float32 model input and on cpu device. 
- user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, - trust_remote_code=trust_remote_code).float() - if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED: - import intel_extension_for_pytorch as ipex - qconfig = ipex.quantization.default_static_qconfig_mapping - user_model = ipex.optimize_transformers( - user_model.eval(), - dtype=torch.float, - inplace=True, - quantization_config=qconfig, - deployment_mode=False, - ) - - # tokenizer - if user_model.config.model_type == "llama": - from transformers import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained(user_model.config.name_or_path) - else: - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained( - user_model.config.name_or_path, trust_remote_code=trust_remote_code - ) - - # example_inputs - example_inputs = get_example_inputs(user_model.config, tokenizer=tokenizer) - - # pylint: disable=E0611 - user_model.config.torchscript = True - config = user_model.config - user_model = make_torchscript_model(user_model, json_file_path, example_inputs) - import intel_extension_for_pytorch as ipex - from intel_extension_for_transformers.transformers.llm.evaluation.models import ( - TSModelCausalLMForITREX, - ) - origin_model_type = config.model_type - if origin_model_type in ["chatglm", "qwen", "baichuan"]: - config.model_type = "qwen2" - user_model = TSModelCausalLMForITREX(user_model, config=config) - user_model.config.model_type = origin_model_type - return user_model ->>>>>>> main From 634ef3eb7364e9fc5e5338e1063029d2918c663f Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 3 Jul 2024 21:10:23 -0700 Subject: [PATCH 24/28] update sq Signed-off-by: changwangss --- .../quantization/run_generation_sq.py | 7 +- .../transformers/llm/quantization/sq_utils.py | 3 +- .../transformers/llm/quantization/utils.py | 1 + .../transformers/modeling/modeling_auto.py | 417 ++++-------------- 4 files changed, 95 insertions(+), 333 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index 7b34ea720f7..c3f7afed86f 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -170,10 +170,7 @@ quantization_config.remove_redundant_parameters() config.quantization_config = quantization_config config.save_pretrained(args.output_dir) - torch.jit.save(user_model, args.output_dir + "/pytorch_model.bin") - with open(args.output_dir + "/best_configure.json", "w") as f: - json.dump(user_model.tune_cfg, f, indent=4) - # validate loading + user_model.save(args.output_dir) user_model = AutoModelForCausalLM.from_pretrained( args.output_dir, trust_remote_code=args.trust_remote_code, @@ -188,7 +185,7 @@ ) user_model = recover_model_from_json( args.model, - os.path.join(args.output_dir, "best_configure.json"), + os.path.join(args.output_dir, "qconfig.json"), args.trust_remote_code, ) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index 634ea7499c6..bb84709ff45 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -324,8 +324,7 @@ def _reorder_cache( This is required to match `past_key_values` with the correct beam_idx at every 
generation step. """ - if self.config.model_type == "bloom": - return self._reorder_cache_bloom(past_key_values, beam_idx) + if self.config.model_type == "chatglm": return tuple( tuple( diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 4a24dc7121d..89e6194799f 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -832,6 +832,7 @@ def calib_func(model): alpha_step=quantization_config.alpha_step, shared_criterion=quantization_config.shared_criterion, do_blockwise=quantization_config.do_blockwise, + excluded_precisions=quantization_config.excluded_precisions, ) # fallback if model_type in ["gptj", "gpt_neox", "mpt"]: diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index f90203d5ee4..17e1a4d8cfa 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -943,41 +943,73 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: model.config.architectures = ["MptForCausalLM"] model.eval() logger.info("Applying SmoothQuant.") -<<<<<<< HEAD model = convert_to_smoothquant_model(model, quantization_config) -======= - # ipex.optimize_transformers - if quantization_config.ipex_opt_llm is None: - if model_type in IPEX_OPT_LLM_SUPPORTED: - quantization_config.ipex_opt_llm = True - logger.info( - "quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used." + logger.info("SmoothQuant done.") + elif isinstance(quantization_config, DynamicQuantConfig): + model = cls.ORIG_MODEL.from_pretrained( + pretrained_model_name_or_path, + *model_args, + config=config, + low_cpu_mem_usage=True, + torch_dtype=torch.float, + **kwargs, + ) + + if ( + not torch.cuda.is_available() + or device_map == "cpu" + or device_map == torch.device("cpu") + ) and model.config.model_type == "chatglm": + model = model.float() + model.eval() + logger.info("Applying DynamicQuant.") + # call inc dynamic quant + from neural_compressor import PostTrainingQuantConfig, quantization + + conf = PostTrainingQuantConfig( + approach="dynamic", + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + op_name_dict=quantization_config.op_name_dict, + ) + model = quantization.fit( + model, + conf, + ) + model.save_pretrained = types.MethodType(save_low_bit, model) + quantization_config.remove_redundant_parameters() + model.quantization_config = quantization_config + logger.info("DynamicQuant done.") + return model + elif isinstance(quantization_config, StaticQuantConfig): + if quantization_config.backend == "ipex": + try: + import intel_extension_for_pytorch as ipex + except ImportError: + logger.warning( + "Please install Intel Extension for PyTorch to accelerate the model inference." 
) - logger.warning("The suggested transformers version is 4.38.1.") - else: - quantization_config.ipex_opt_llm = False - if quantization_config.ipex_opt_llm: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=0.5) - model = ipex.optimize_transformers( - model.eval(), - quantization_config=qconfig, - dtype=torch.float32, - inplace=True, - deployment_mode=False, - ) - model.eval() + config.torchscript = True + assert quantization_config.example_inputs is not None, \ + "Please provide example_inputs for IPEX static quantization." - # past_key_values - num_beams = quantization_config.num_beams - if quantization_config.ipex_opt_llm: - past_key_values = generate_dummy_past_key_values_for_opt_llm( - config=model.config, input_bs=1, num_beams=num_beams - ) - else: - past_key_values = generate_dummy_past_key_values( - config=model.config, input_bs=1 - ) + model = cls.ORIG_MODEL.from_pretrained( + pretrained_model_name_or_path, + *model_args, + config=config, + low_cpu_mem_usage=True, + torch_dtype=torch.float, + **kwargs, + ) + if ( + not torch.cuda.is_available() + or device_map == "cpu" + or device_map == torch.device("cpu") + ) and model.config.model_type == "chatglm": + model = model.float() + model.eval() + logger.info("Applying StaticQuant.") # calibration function calib_func = quantization_config.calib_func tokenizer = quantization_config.tokenizer @@ -1029,10 +1061,8 @@ def tokenize_function(examples): return example def collate_batch(batch): - position_ids_padded = [] input_ids_padded = [] last_ind = [] - attention_mask_padded = [] for text in batch: input_ids = text["input_ids"] if not calib_padding: @@ -1048,99 +1078,30 @@ def collate_batch(batch): ) last_ind.append(input_ids.shape[0] - 1) - attention_mask = torch.ones(len(input_ids)) - position_ids = torch.arange(len(input_ids)) input_ids_padded.append(input_ids) - attention_mask_padded.append(attention_mask) - position_ids_padded.append(position_ids) - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - return ( - { - "input_ids": torch.vstack(input_ids_padded), - "attention_mask": torch.vstack(attention_mask_padded), - "position_ids": torch.vstack(position_ids_padded), - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - else: - return ( - { - "input_ids": torch.vstack(input_ids_padded), - "attention_mask": torch.vstack(attention_mask_padded), - "past_key_values": past_key_values, - }, - torch.tensor(last_ind), - ) - def collate_batch_for_chatglm(batch): - last_ind = [] - for text in batch: - input_ids = torch.vstack([text["input_ids"]]) - if re.search( - "THUDM/chatglm-6b", model.config.auto_map["AutoConfig"] - ): - input_ids = ( - input_ids[:, :calib_len] - if input_ids.shape[1] > calib_len - else input_ids - ) - eos = torch.tensor([130001, 130004]).repeat(1, 1) - input_ids = torch.cat((input_ids, eos), 1) - else: - input_ids = ( - input_ids[:, :calib_len] - if input_ids.shape[1] > calib_len - else input_ids - ) - prepared_inputs = model.prepare_inputs_for_generation(input_ids) - attention_mask = torch.ones_like(input_ids) - last_ind.append(input_ids.shape[1] - 1) return ( { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": prepared_inputs["position_ids"], - "past_key_values": past_key_values, + "input_ids": torch.vstack(input_ids_padded), }, torch.tensor(last_ind), ) tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - if model_type == "chatglm": - calib_dataloader = 
DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch_for_chatglm, - ) - else: - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) def calib_func(model): with torch.no_grad(): for i, (inputs, last_ind) in enumerate(calib_dataloader): if i >= calib_iters: break - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - model( - input_ids=inputs["input_ids"], - past_key_values=inputs["past_key_values"], - position_ids=inputs["position_ids"], - attention_mask=inputs["attention_mask"], - ) - else: - model( - input_ids=inputs["input_ids"], - past_key_values=inputs["past_key_values"], - attention_mask=inputs["attention_mask"], - ) + model(**inputs) logger.info( "The default calibration function is used, " @@ -1149,236 +1110,26 @@ def calib_func(model): ) calib_func = calib_func - # example_inputs - example_inputs = quantization_config.example_inputs - if example_inputs is None: - for i, (inputs, last_ind) in enumerate(calib_dataloader): - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: - example_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "position_ids": inputs["position_ids"], - "past_key_values": inputs["past_key_values"], - } - else: - example_inputs = { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "past_key_values": inputs["past_key_values"], - } - break - - # call inc sq + # call inc static quant from neural_compressor import PostTrainingQuantConfig, quantization conf = PostTrainingQuantConfig( - backend=quantization_config.backend, # default is ipex + backend=quantization_config.backend, excluded_precisions=quantization_config.excluded_precisions, op_type_dict=quantization_config.op_type_dict, op_name_dict=quantization_config.op_name_dict, - recipes=quantization_config.recipes, - example_inputs=example_inputs, + example_inputs=quantization_config.example_inputs, ) - model = quantization.fit( model, conf, calib_func=calib_func, - calib_dataloader=( - calib_dataloader - if quantization_config.recipes["smooth_quant_args"]["alpha"] - == "auto" - else None - ), - ) ->>>>>>> main - logger.info("SmoothQuant done.") - elif isinstance(quantization_config, DynamicQuantConfig): - model = cls.ORIG_MODEL.from_pretrained( - pretrained_model_name_or_path, - *model_args, - config=config, - low_cpu_mem_usage=True, - torch_dtype=torch.float, - **kwargs, - ) - - if ( - not torch.cuda.is_available() - or device_map == "cpu" - or device_map == torch.device("cpu") - ) and model.config.model_type == "chatglm": - model = model.float() - model.eval() - logger.info("Applying DynamicQuant.") - # call inc dynamic quant - from neural_compressor import PostTrainingQuantConfig, quantization - - conf = PostTrainingQuantConfig( - approach="dynamic", - excluded_precisions=quantization_config.excluded_precisions, - op_type_dict=quantization_config.op_type_dict, - op_name_dict=quantization_config.op_name_dict, - ) - model = quantization.fit( - model, - conf, ) model.save_pretrained = types.MethodType(save_low_bit, model) quantization_config.remove_redundant_parameters() model.quantization_config = quantization_config - logger.info("DynamicQuant done.") + logger.info("StaticQuant done.") return model - # elif isinstance(quantization_config, StaticQuantConfig): - # if quantization_config.backend == "ipex": - # try: 
- # import intel_extension_for_pytorch as ipex - # except ImportError: - # logger.warning( - # "Please install Intel Extension for PyTorch to accelerate the model inference." - # ) - # config.torchscript = True - # assert quantization_config.example_inputs is not None, \ - # "Please provide example_inputs for IPEX static quantization." - - # model = cls.ORIG_MODEL.from_pretrained( - # pretrained_model_name_or_path, - # *model_args, - # config=config, - # low_cpu_mem_usage=True, - # torch_dtype=torch.float, - # **kwargs, - # ) - - # if ( - # not torch.cuda.is_available() - # or device_map == "cpu" - # or device_map == torch.device("cpu") - # ) and model.config.model_type == "chatglm": - # model = model.float() - # model.eval() - # logger.info("Applying StaticQuant.") - # # calibration function - # calib_func = quantization_config.calib_func - # tokenizer = quantization_config.tokenizer - # if calib_func is None: - # if quantization_config.tokenizer is None: - # logger.error( - # "Please provide the tokenizer or provide calib_func directly," - # + " the following is how to get tokenizer. \n" - # + " from transformer import AutoTokenizer \n" - # + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - # ) - # exit(0) - - # from datasets import load_dataset - # from torch.utils.data import DataLoader - - # calib_dataset = quantization_config.calib_dataset - # calib_shuffle = quantization_config.calib_shuffle - # calib_iters = quantization_config.calib_iters - # calib_padding = quantization_config.calib_padding - # calib_len = quantization_config.calib_len - # calib_pad_val = quantization_config.calib_pad_val - # from torch.nn.functional import pad - - # calib_dataset = load_dataset( - # calib_dataset, - # split=( - # "test" - # if calib_dataset in ["mbpp", "openai_humaneval"] - # else "train" - # ), - # ) - # if calib_shuffle: - # calib_dataset = calib_dataset.shuffle(seed=42) - - # def tokenize_function(examples): - # if "code" in examples: - # example = tokenizer(examples["code"]) - # elif "prompt" in examples: - # example = tokenizer(examples["prompt"]) - # elif "text" in examples: - # example = tokenizer(examples["text"]) - # else: - # logger.error( - # "Please check dataset prompt identifier," - # + " NeelNanda/pile-10k is default used calibration dataset." 
- # ) - # exit(0) - # return example - - # def collate_batch(batch): - # input_ids_padded = [] - # last_ind = [] - # for text in batch: - # input_ids = text["input_ids"] - # if not calib_padding: - # input_ids = ( - # input_ids[: int(calib_len)] - # if len(input_ids) > int(calib_len) - # else input_ids - # ) # no_padding - # else: - # pad_len = calib_len - input_ids.shape[0] - # input_ids = pad( - # input_ids, (0, pad_len), value=calib_pad_val - # ) - - # last_ind.append(input_ids.shape[0] - 1) - # input_ids_padded.append(input_ids) - - # return ( - # { - # "input_ids": torch.vstack(input_ids_padded), - # }, - # torch.tensor(last_ind), - # ) - - # tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - # tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - # calib_dataloader = DataLoader( - # tokenized_dataset, - # batch_size=1, - # shuffle=False, - # collate_fn=collate_batch, - # ) - - # def calib_func(model): - # with torch.no_grad(): - # for i, (inputs, last_ind) in enumerate(calib_dataloader): - # if i >= calib_iters: - # break - # model(**inputs) - - # logger.info( - # "The default calibration function is used, " - # + "the calibration dataset is NeelNanda/pile-10k, " - # + "batchsize is 1 and calibration iteration is 100." - # ) - # calib_func = calib_func - - # # call inc static quant - # from neural_compressor.torch.quantization import StaticQuantConfig, convert, prepare - # quant_config = StaticQuantConfig( - # w_dtype=quantization_config.w_dtype, - # w_sym=quantization_config.w_sym, - # w_granularity=quantization_config.w_granularity, - # w_algo=quantization_config.w_algo, - # act_dtype=quantization_config.act_dtype, - # act_sym=quantization_config.act_sym, - # act_granularity=quantization_config.act_granularity, - # act_algo=quantization_config.act_algo, - # white_list=quantizate_config.white_list, - # ) - # prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs) - # calib_func(prepared_model) - # q_model = convert(prepared_model) - # model.save_pretrained = types.MethodType(save_low_bit, model) - # quantization_config.remove_redundant_parameters() - # model.quantization_config = quantization_config - # logger.info("StaticQuant done.") - # return model elif isinstance(quantization_config, QuantAwareTrainingConfig): model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, @@ -1732,6 +1483,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # index of the files. is_sharded = False sharded_metadata = None + if pretrained_model_name_or_path is not None: pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) @@ -1749,6 +1501,20 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): subfolder, _add_variant(WEIGHTS_NAME, variant), ) + # only for inc sq + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant("quantized_model.pt", variant), + ) + ): + # Load from a PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant("quantized_model.pt", variant), + ) elif os.path.isfile( os.path.join( pretrained_model_name_or_path, @@ -1897,7 +1663,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}." 
) from e - if is_local: logger.info(f"loading weights file {archive_file}") resolved_archive_file = archive_file @@ -1951,7 +1716,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) q_model = torch.jit.load( - os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + os.path.join(pretrained_model_name_or_path, "quantized_model.pt") ) origin_model_type = config.model_type if origin_model_type in ["chatglm", "qwen", "baichuan"]: From 565c3778118d2e51dd814b0525ac6e9c63e8a4dd Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 3 Jul 2024 21:16:27 -0700 Subject: [PATCH 25/28] remove woq hqq Signed-off-by: changwangss --- .../quantization/run_generation_cpu_woq.py | 12 +---- .../transformers/__init__.py | 1 - .../transformers/llm/quantization/utils.py | 14 ----- .../transformers/modeling/modeling_auto.py | 3 +- .../transformers/utils/__init__.py | 1 - .../transformers/utils/config.py | 54 ------------------- 6 files changed, 2 insertions(+), 83 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py index cf38ed87748..cd59d9c4086 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py @@ -12,7 +12,6 @@ from intel_extension_for_transformers.transformers import ( BitsAndBytesConfig, RtnConfig, - HqqConfig, AwqConfig, TeqConfig, GPTQConfig, @@ -49,7 +48,7 @@ parser.add_argument( "--woq_algo", default="Rtn", - choices=["Rtn", "Awq", "Teq", "GPTQ", "AutoRound", "HQQ"], + choices=["Rtn", "Awq", "Teq", "GPTQ", "AutoRound"], help="Weight-only algorithm.", ) parser.add_argument( @@ -225,15 +224,6 @@ layer_wise=args.layer_wise, use_ipex=args.use_ipex, ) - elif args.woq_algo == "HQQ": - quantization_config = HqqConfig( - bits=args.bits, - group_size=args.group_size, - compute_dtype=args.compute_dtype, - scale_dtype=args.scale_dtype, - weight_dtype=args.weight_dtype, - use_ipex=args.use_ipex, - ) elif args.woq_algo == "Awq": quantization_config = AwqConfig( tokenizer=tokenizer, diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 82d4475ba2d..94b8091c60e 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -33,7 +33,6 @@ DynamicQuantConfig, QuantAwareTrainingConfig, RtnConfig, - HqqConfig, AwqConfig, TeqConfig, GPTQConfig, diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 89e6194799f..eb18e88baab 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -28,7 +28,6 @@ AutoRoundConfig, AWQConfig, GPTQConfig, - HQQConfig, RTNConfig, SmoothQuantConfig, TEQConfig, @@ -554,19 +553,6 @@ def convert_to_quantized_model(model, config, device="cpu"): quant_config.set_local(".*embed_out", RTNConfig(dtype="fp32")) model = prepare(model, quant_config) model = convert(model) - elif config.quant_method.value == "hqq": - quant_config = HQQConfig( - bits=config.bits, - group_size=config.group_size, - quant_zero=config.quant_zero, - quant_scale=config.quant_scale, - scale_quant_group_size=config.scale_quant_group_size, - 
skip_lm_head=config.skip_lm_head, - ) - model = prepare(model, quant_config) - model = convert(model) - # qits module doesn't match with HQQ algorithm. - return model elif config.quant_method.value == "awq": quant_config = AWQConfig( dtype=dtype, diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 17e1a4d8cfa..c10b3bec0b5 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -46,7 +46,6 @@ DynamicQuantConfig, QuantAwareTrainingConfig, RtnConfig, - HqqConfig, AwqConfig, TeqConfig, GPTQConfig, @@ -810,7 +809,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: logger.info("Mixed Precision done.") elif isinstance( quantization_config, - (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig, HqqConfig), + (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig), ): logger.info("Applying Weight Only Quantization.") if use_neural_speed: diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index ee258fbd797..4eaba5a00fe 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -25,7 +25,6 @@ QuantAwareTrainingConfig, SparsityConfig, RtnConfig, - HqqConfig, AwqConfig, TeqConfig, GPTQConfig, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 3f6da8c8a05..c39d9788cff 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -50,7 +50,6 @@ class QuantizationMethod(str, Enum): AWQ = "awq" AQLM = "aqlm" RTN = "rtn" - HQQ = "hqq" AUTOROUND = "autoround" TEQ = "teq" DYNAMIC = "dynamic" @@ -851,59 +850,6 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict -class HqqConfig(ITREXQuantizationConfigMixin): - def __init__( - self, - bits: int = 4, - group_size: int = 64, - sym: bool = True, - compute_dtype: Any = None, - weight_dtype: Any = None, - scale_dtype: Any = None, - quant_zero: bool = True, - quant_scale: bool = False, - scale_quant_group_size: int = 128, - skip_lm_head: bool = True, - **kwargs, - ): - self.quant_method = QuantizationMethod.HQQ - self.bits = bits - self.weight_dtype = weight_dtype - self.compute_dtype = compute_dtype - self.scale_dtype = scale_dtype - self.use_double_quant = False - self.sym = sym - self.scheme = "sym" if self.sym else "asym" - self.group_size = group_size - self.quant_zero = quant_zero - self.quant_scale = quant_scale - self.scale_quant_group_size = scale_quant_group_size - self.skip_lm_head = skip_lm_head - self.llm_int8_skip_modules = kwargs.get("llm_int8_skip_modules", []) - self.device = kwargs.get("device", "auto") - self.use_ipex = kwargs.pop("use_ipex", False) - - def to_diff_dict(self) -> Dict[str, Any]: - """Removes all attributes from config which correspond to the default config attributes - for better readability and serializes to a Python dictionary. 
- - Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, - """ - config_dict = self.to_dict() - - # get the default config dict - default_config_dict = RtnConfig().to_dict() - - serializable_config_dict = {} - - # only serialize values that differ from the default config - for key, value in config_dict.items(): - if value != default_config_dict[key]: - serializable_config_dict[key] = value - - return serializable_config_dict - class GPTQConfig(ITREXQuantizationConfigMixin): def __init__( self, From 54c8157c3562c06e281b25c31a62bf2562ce29ff Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 3 Jul 2024 22:48:00 -0700 Subject: [PATCH 26/28] fix pylint Signed-off-by: changwangss --- .../transformers/modeling/modeling_auto.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index c10b3bec0b5..f691f951b15 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -366,26 +366,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if use_vllm is not None: logger.info("The backend is vLLM.") from vllm import LLM # pylint: disable=E1101 - from vllm.model_executor.model_loader import ( - get_model_loader, - ) # pylint: disable=E0611 - from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - ) # pylint: disable=E0401 disable=E0611 + from vllm.model_executor.model_loader import get_model_loader # pylint: disable=E0611 + from vllm.model_executor.model_loader.weight_utils import default_weight_loader # pylint: disable=E0401 disable=E0611 from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, ColumnParallelLinear, - RowParallelLinear, - ) # pylint: disable=E1101 + RowParallelLinear) # pylint: disable=E1101 os.environ["backend"] = "use_vllm" llm = LLM( model=pretrained_model_name_or_path, trust_remote_code=True ) # Create an vllm instance. 
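
This vLLM path reaches into the engine for the underlying torch module and swaps its linear layers in place. Setting the vLLM-specific parallel linear classes aside, the swap itself is the usual recursive module replacement; a generic sketch over plain nn.Linear with a caller-supplied factory:

import torch.nn as nn

def swap_linear_modules(model: nn.Module, make_replacement) -> nn.Module:
    """Recursively replace every nn.Linear child with make_replacement(child)."""
    for name, child in model.named_children():
        if isinstance(child, nn.Linear):
            setattr(model, name, make_replacement(child))
        else:
            swap_linear_modules(child, make_replacement)
    return model

# e.g. swap_linear_modules(model, lambda lin: MyQuantLinear.from_float(lin)),
# where MyQuantLinear is a hypothetical stand-in for the QBits-backed module.
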
- model = ( - llm.llm_engine.model_executor.driver_worker.model_runner.model - ) # pylint: disable=E1101 + model = llm.llm_engine.model_executor.driver_worker.model_runner.model # pylint: disable=E1101 print("Original model =", model) original_parameter_memo = dict() @@ -447,9 +440,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: ) print("Optimized model =", model) - loader = get_model_loader( - llm.llm_engine.load_config - ) # pylint: disable=E1101 + loader = get_model_loader(llm.llm_engine.load_config) # pylint: disable=E1101 weights_iterator = loader._get_weights_iterator( llm.llm_engine.model_config.model, @@ -457,9 +448,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: fall_back_to_pt=True, ) - from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - ) # pylint: disable=E0401 disable=E0611 + from vllm.model_executor.model_loader.weight_utils import default_weight_loader # pylint: disable=E0401 disable=E0611 params_dict = dict(model.named_parameters(remove_duplicate=False)) for name in params_dict.keys(): From f77c8f88009cee4802e02c7e25989d0b1fbef156 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 10 Jul 2024 07:22:55 -0700 Subject: [PATCH 27/28] fit to the latest inc Signed-off-by: changwangss --- .../llm/quantization/autograd/functions.py | 8 +- .../llm/quantization/nn/modules.py | 28 +- .../transformers/llm/quantization/sq_utils.py | 1 - .../transformers/llm/quantization/utils.py | 94 +++--- .../transformers/modeling/modeling_auto.py | 42 ++- .../transformers/utils/config.py | 6 +- tests/CI/test_quantization.py | 275 +++++++++--------- tests/CI/test_weight_only.py | 28 +- tests/CI/test_weight_only_gpu.py | 4 +- 9 files changed, 235 insertions(+), 251 deletions(-) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/autograd/functions.py b/intel_extension_for_transformers/transformers/llm/quantization/autograd/functions.py index 68f17dca5cc..483e27da94b 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/autograd/functions.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/autograd/functions.py @@ -39,17 +39,17 @@ class qbits_acquire_type(Enum): def qbits_woq_linear_ref_impl(activation, packw, bias, compute_type, weight_type, scale_type): - assert (activation.is_contiguous()) - assert (packw.is_contiguous()) activation = activation.to(torch.float32) n = qbits.acquire_packed_weight_info( packw, qbits_acquire_type.N.value)[0].item() k = activation.shape[1] revert_wei = torch.empty(k, n, dtype=torch.float) qbits.dequantize_packed_weight( - packw, revert_wei, False, compute_type, weight_type, scale_type) + packw, revert_wei, False, compute_type, weight_type, "fp32") + enable_act_shuffle = qbits.acquire_packed_weight_info( packw, qbits_acquire_type.ACT_SHUFFLE.value)[0] != 0 + if enable_act_shuffle: g_idx = qbits.acquire_packed_weight_info( packw, qbits_acquire_type.G_IDX.value) @@ -59,6 +59,7 @@ def qbits_woq_linear_ref_impl(activation, packw, bias, compute_type, weight_typ assert (bias.is_contiguous()) assert (bias.dtype == torch.float32) out += bias + return out @@ -117,6 +118,7 @@ def forward( False if scheme == "sym" else True, ) else: + out = qbits_woq_linear_ref_impl( A, B.data, bias, compute_dtype, weight_dtype, scale_dtype) output = out diff --git a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py index 
da527672eb3..51eccf739dd 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py @@ -25,6 +25,7 @@ from intel_extension_for_transformers.transformers.llm.quantization.autograd import ( matmul_kbit, ) import intel_extension_for_transformers.qbits as qbits # pylint: disable=E0611, E0401 +from neural_compressor.torch.algorithms.weight_only.utility import quant_tensor as quant_nf4_fp4 class DropoutQBits_(torch.autograd.Function): @@ -221,10 +222,14 @@ def set_weights_bias( g_idx = torch.empty(0, dtype=torch.int32) else: g_idx = torch.empty(0, dtype=torch.int32) - if q_config.bits == 4: + if q_config.bits == 4 and 'f' not in q_config.weight_dtype: int_weight = (int_weight - 8) * 16 // 16 gptq_zeros = (gptq_zeros - 8) * 16 // 16 + if q_config.weight_dtype in ["nf4", "fp4", "fp4_e2m1"]: + int_weight = torch.where(int_weight < 0, int_weight + 16, int_weight) + int_weight = int_weight.t_() + gptq_scales = gptq_scales.t_() if q_config.sym: gptq_zeros = torch.empty(0, dtype=torch.int8) @@ -329,7 +334,7 @@ def recover_int_weight(g_idx, int_weight): g_idx = None weight_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 6) weight_dtype = "".join(chr(ascii_code) for ascii_code in weight_dtype_ascii.tolist()) - bits = 4 if weight_dtype in ["nf4", "int4_clip", "fp4_e2m1", "fp4_e2m1_bnb"] else 8 + bits = 4 if weight_dtype in ["nf4", "int4_clip", "fp4_e2m1"] else 8 compute_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 7) compute_dtype = "".join(chr(ascii_code) for ascii_code in compute_dtype_ascii.tolist()) scales_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 8) @@ -352,12 +357,19 @@ def recover_int_weight(g_idx, int_weight): qbits.dequantize_packed_weight(self.weight, revert_wei, False, compute_dtype, weight_dtype, scales_dtype) - int_weight = self.quant_weight_w_scale( - revert_wei.t(), - scales.t(), - qzeros.to(torch.uint8).t() if qzeros is not None else None, - group_size=group_size, - ) + if weight_dtype in ["nf4", "fp4", "fp4_e2m1"]: + int_weight = quant_nf4_fp4(revert_wei.t(), + bits=bits, + group_size=group_size, + dtype=weight_dtype, + return_int=True)[0] + else: + int_weight = self.quant_weight_w_scale( + revert_wei.t(), + scales.t(), + qzeros.to(torch.uint8).t() if qzeros is not None else None, + group_size=group_size, + ) if g_idx is not None: int_weight = recover_int_weight(g_idx, int_weight.t()) diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index bb84709ff45..27cfb81764b 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -84,7 +84,6 @@ "llama", "mistral", "chatglm", - "baichuan", } diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 8d43d29dde6..58dcf2edd59 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -68,7 +68,6 @@ DTYPE_BITS_MAPPING = { "nf4": 4, "fp4": 4, # fp4 == fp4_e2m1 - "fp4_e2m1_bnb": 4, "fp4_e2m1": 4, "int4": 4, "int4_fullrange": 4, @@ -84,27 +83,30 @@ def unpack_weight(qweight, scales, qzeros, q_config): sym = q_config.sym bits = q_config.bits wf = 
torch.tensor(list(range(0, 32, bits)), dtype=torch.int32).unsqueeze(0) + if qzeros is not None: + zeros = torch.bitwise_right_shift( + torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0) + ).to(torch.int16 if bits == 8 else torch.int8) + torch.bitwise_and(zeros, (2**bits) - 1, out=zeros) + if bits == 8: + zeros = zeros.to(torch.int8 if sym else torch.uint8) + # due to INC minus one + zeros = zeros + 1 + try: + zeros = zeros.reshape(scales.shape) + except: + # zeros and scales have different iteam numbers. + # remove 1 (due to 0 + 1 in line 68) + zeros = zeros[zeros != 1] + zeros = zeros.reshape(scales.shape) - zeros = torch.bitwise_right_shift( - torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0) - ).to(torch.int16 if bits == 8 else torch.int8) - torch.bitwise_and(zeros, (2**bits) - 1, out=zeros) - if bits == 8: - zeros = zeros.to(torch.int8 if sym else torch.uint8) - # due to INC minus one - zeros = zeros + 1 - try: - zeros = zeros.reshape(scales.shape) - except: - # zeros and scales have different iteam numbers. - # remove 1 (due to 0 + 1 in line 68) - zeros = zeros[zeros != 1] - zeros = zeros.reshape(scales.shape) - - # due to INC asym return torch.uint8 but backend request int8, - # change it to int8 with offset 128 - if not sym and bits == 8: - zeros = (zeros.to(torch.int32) - 128).to(torch.int8) + # due to INC asym return torch.uint8 but backend request int8, + # change it to int8 with offset 128 + if not sym and bits == 8: + zeros = (zeros.to(torch.int32) - 128).to(torch.int8) + zeros = zeros.contiguous() + else: + zeros = None weight = torch.bitwise_right_shift( torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1), wf.unsqueeze(-1) @@ -120,7 +122,7 @@ def unpack_weight(qweight, scales, qzeros, q_config): # change it to int8 with offset 128 if not sym: weight = (weight.to(torch.int32) - 128).to(torch.int8) - return weight.contiguous(), scales.contiguous(), zeros.contiguous() + return weight.contiguous(), scales.contiguous(), zeros def replace_linear( @@ -180,9 +182,6 @@ def _replace_linear( quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", - "fp4_e2m1_bnb", - "fp4_e2m1", - "nf4", "int4_fullrange", ] @@ -272,9 +271,6 @@ def _replace_linear( quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", - "fp4_e2m1_bnb", - "fp4_e2m1", - "nf4", ] model._modules[name] = QuantizedLinearQBits( @@ -374,22 +370,25 @@ def _replace_linear( if quantization_config.weight_dtype in [ "fp8_e5m2", "fp8_e4m3", - "nf4", - "fp4_e2m1_bnb", - "fp4_e2m1", ]: model._modules[name].set_fp_weights_bias( module.weight.data, None if module.bias is None else module.bias.data, ) else: - int_weight, scales, zeros = unpack_weight( - module.qweight, - module.scales, - module.qzeros, - quantization_config, - ) - int_weight = int_weight.view(-1, int_weight.shape[-1]) + if quantization_config.weight_dtype in ["int4", "int4_clip", "int8"]: + int_weight, scales, zeros = unpack_weight( + module.qweight, + module.scales, + module.qzeros if hasattr(module, "qzeros") else None, + quantization_config, + ) + int_weight = int_weight.view(-1, int_weight.shape[-1]) + else: + int_weight = module.unpack_tensor_with_numpy(module.qweight) + scales = module.scales + zeros = module.qzeros if hasattr(module, "qzeros") else None + model._modules[name].set_weights_bias( int_weight, scales, @@ -601,7 +600,6 @@ def convert_to_quantized_model(model, config, device="cpu"): use_layer_wise=config.layer_wise, absorb_to_layer=config.absorb_to_layer ) - assert config.absorb_to_layer != 
{}, "absorb_to_layer is necessary for TEQ algorithm" quant_config.set_local(".*lm_head", TEQConfig(dtype="fp32")) quant_config.set_local(".*output_layer", TEQConfig(dtype="fp32")) quant_config.set_local(".*embed_out", TEQConfig(dtype="fp32")) @@ -619,6 +617,7 @@ def convert_to_quantized_model(model, config, device="cpu"): model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs) run_fn(model, *run_args) model = convert(model) + elif config.quant_method.value == "gptq": model.seqlen = config.seq_len quant_config = GPTQConfig( @@ -682,21 +681,14 @@ def convert_to_quantized_model(model, config, device="cpu"): assert False, "The Supported algorithm are RTN, AWQ, TEQ, GPTQ, AUTOROUND" if device == "xpu" or device == torch.device("xpu"): - model = model.export_compressed_model( - compression_dtype=torch.int8, - compression_dim=0, - use_optimum_format=False, - scale_dtype=convert_dtype_str2torch(config.scale_dtype), - device="xpu", - ) if _ipex_version < "2.3.10" else inc_model.export_compressed_model(use_optimum_format=True, device="xpu") - - q_model = replace_linear(model, None, None, config, device=device) - else: - model.eval() - q_model = replace_linear(model, None, None, config, device=device) + logger.warning("The recommended ipex version is higher than 2.3.10 for xpu device.") + + model.eval() + q_model = replace_linear(model, None, None, config, device=device) if orig_dtype != torch.float32: q_model.to(dtype=orig_dtype) + return q_model.to(device) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 708f34326c0..6251b308be2 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -123,6 +123,7 @@ def recover_export_model(model, current_key_name=None): int_weight, ) = module.recover_qparms() dtype = "int4" if weight_dtype == "int4_clip" else weight_dtype + use_optimum_format = False if weight_dtype in ["fp4_e2m1", "fp4", "nf4"] else True model._modules[name] = WeightOnlyLinear( in_features, out_features, @@ -133,17 +134,20 @@ def recover_export_model(model, current_key_name=None): bias=module.bias is not None, scale_dtype=scales_dtype, g_idx=desc_act, - use_optimum_format=True, + use_optimum_format=use_optimum_format, ) # Setting g_idx is invalid when use_optimum_format is True, so set it again when g_idx is not None. 
# https://github.com/intel/neural-compressor/blob/v2.5.dev2/neural_compressor/adaptor/torch_utils/ # model_wrapper.py#L343 model._modules[name].pack( - int_weight, scales, zeros, module.bias, g_idx=g_idx + int_weight.contiguous(), + scales.contiguous(), + zeros.contiguous() if zeros is not None else None, + module.bias.contiguous() if module.bias is not None else None, ) if g_idx is not None: - model._modules[name].g_idx = g_idx + model._modules[name].g_idx = g_idx.contiguous() if len(list(module.children())) > 0: # pylint: disable=E1101 _ = recover_export_model(module, current_key_name) @@ -154,7 +158,7 @@ def recover_export_model(model, current_key_name=None): def build_woq_model(model, quantization_config): from neural_compressor.adaptor.torch_utils.util import set_module - + weight_dtype = quantization_config.weight_dtype for n, m in model.named_modules(): if "lm_head" in n or "output_layer" in n or "embed_out" in n: continue @@ -164,17 +168,19 @@ def build_woq_model(model, quantization_config): "zero_point", not getattr(quantization_config, "sym", False), ) + dtype = "int4" if weight_dtype == "int4_clip" else weight_dtype + use_optimum_format = False if weight_dtype in ["nf4", "fp4", "fp4_e2m1"] else True with init_empty_weights(): new_module = WeightOnlyLinear( m.in_features, m.out_features, - dtype="int", + dtype=dtype, bits=quantization_config.bits, group_size=quantization_config.group_size, zp=zp, bias=m.bias is not None, g_idx=True, - use_optimum_format=True, + use_optimum_format=use_optimum_format, ) set_module(model, n, new_module) return model @@ -194,18 +200,10 @@ def convert_model_to_public(model): elif model.quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", - "nf4", - "fp4_e2m1", - "fp4_e2m1_bnb", ]: model = recover_export_model(model) -def make_contiguous(model): - for param in model.parameters(): - if param.data.ndimension() > 1: - param.data = param.data.contiguous() - def save_low_bit( self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs @@ -234,7 +232,7 @@ def save_low_bit( os.makedirs(save_directory, exist_ok=True) # use transformers original `save_pretrained` function del self.save_pretrained - make_contiguous(self) + self.save_pretrained( save_directory=save_directory, push_to_hub=push_to_hub, **kwargs ) @@ -733,7 +731,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: ) else convert_dtype_torch2str(torch_dtype) ), - weight_dtype="nf4" if use_cpu else "int4_fullrange", + weight_dtype="int4_clip" if use_cpu else "int4_fullrange", ) else: assert ( @@ -741,7 +739,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: and convert_dtype_str2torch(quantization_config.compute_dtype) == torch_dtype ), "Quantization_config.weight_dtype should be 'nf4' , 'int4', 'int4_fullrange', 'int4_clip', " - f"'fp4', 'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}." + f"'fp4', 'fp4_e2m1' and compute_dtype should be {torch_dtype}." 
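# Hedged usage sketch for the 4-bit CPU branch above: with this patch, load_in_4bit on
# CPU resolves to an int4_clip RtnConfig rather than nf4. The checkpoint name is a
# placeholder; the call pattern mirrors the tests later in this series.
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig

woq_config = RtnConfig(bits=4, weight_dtype="int4_clip", group_size=32)
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",             # placeholder small checkpoint
    quantization_config=woq_config,
    use_neural_speed=False,
)
model.eval()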
elif load_in_8bit: if quantization_config is None: if use_neural_speed: @@ -856,6 +854,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: **kwargs, ) model.config.update({"low_cpu_mem_usage": False}) + quantization_config.post_init_xpu() else: kwargs["low_cpu_mem_usage"] = True config.torchscript = ( @@ -870,6 +869,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]: **kwargs, ) model.config.update({"low_cpu_mem_usage": True}) + quantization_config.post_init_cpu() model.eval() if use_xpu: @@ -1807,7 +1807,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): raise ValueError( f"weight_dtype must be a string in " f"'int8', 'int4', 'int4_fullrange', 'int4_clip', 'nf4', " - f"'fp4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8', 'fp8_e5m2, fp8_e4m3'" + f"'fp4', 'fp4_e2m1', 'fp8', 'fp8_e5m2, fp8_e4m3'" ) else: logger.info( @@ -1825,9 +1825,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): if quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", - "fp4_e2m1", - "fp4_e2m1_bnb", - "nf4", ]: model = build_woq_model(model, quantization_config) else: @@ -1944,9 +1941,6 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]): if quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", - "nf4", - "fp4_e2m1", - "fp4_e2m1_bnb", ] and not quantization_config.use_ipex: model = replace_linear( model, diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 7d27377ff83..2bb25b791d7 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -296,7 +296,6 @@ def post_init_cpu(self): if self.bits == 4 and self.weight_dtype not in [ "int4_clip", "nf4", - "fp4_e2m1_bnb", "fp4_e2m1", ]: self.weight_dtype = "int4_clip" @@ -314,14 +313,13 @@ def post_init_cpu(self): "int8", "int4_clip", "nf4", - "fp4_e2m1_bnb", "fp4_e2m1", "fp8_e5m2", "fp8_e4m3", ]: raise ValueError( f"weight_dtype must be a string in " - f"'int8', 'int4', 'int4_clip', 'nf4', 'fp4', 'fp4_e2m1_bnb', 'fp4_e2m1', " + f"'int8', 'int4', 'int4_clip', 'nf4', 'fp4', 'fp4_e2m1', " f"'fp8', 'fp8_e5m2, fp8_e4m3'" ) @@ -1043,6 +1041,7 @@ def __init__( weight_dtype: Any = None, scale_dtype: Any = None, layer_wise: bool = False, + absorb_to_layer: dict = {}, n_samples: int = 128, seq_len: int = 2048, use_double_quant=False, @@ -1060,6 +1059,7 @@ def __init__( self.weight_dtype = weight_dtype self.scale_dtype = scale_dtype self.group_size = group_size + self.absorb_to_layer = absorb_to_layer self.sym = sym self.scheme = "sym" if self.sym else "asym" self.layer_wise = layer_wise diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index f7d767384f6..c53d139dd51 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -241,112 +241,114 @@ def test_quantization_for_llm(self): fp32_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, use_neural_speed=False) dummy_input = fp32_model.dummy_inputs["input_ids"] - # # Dynamic quant - # dq_config = DynamicQuantConfig() - # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=dq_config, - # ) - # q_model.eval() - # output = q_model(dummy_input) - # q_model.save_pretrained("./saved_results") - # output = q_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) - # q_model = 
AutoModelForCausalLM.from_pretrained("./saved_results" - # ) - # output = q_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) - # # Static quant - # sq_config = StaticQuantConfig( - # tokenizer=tokenizer, # either two of one, tokenizer or calib_func - # calib_iters=2, - # ) - # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=sq_config, - # ) - # q_model.eval() - # output = q_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) - # q_model.save_pretrained("./saved_results") - # loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") - # loading_model.eval() - # output = loading_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) - # # Quant aware training - # qat_config = QuantAwareTrainingConfig( - # tokenizer=tokenizer, # either two of one, tokenizer or train_func - # train_iters=2, - # ) - # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=qat_config, - # ) - # q_model.eval() - # output = q_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) - # q_model.save_pretrained("./saved_results") - # loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") - # loading_model.eval() - # output = loading_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) - # # Smoothquant - # sq_config = SmoothQuantConfig( - # tokenizer=tokenizer, # either two of one, tokenizer or calib_func - # calib_iters=2, - # ipex_opt_llm=False - # ) - # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=sq_config, - # use_neural_speed=False - # ) - # self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule)) - - # # Smoothquant auto - # recipes = { - # "smooth_quant": True, - # "smooth_quant_args": { "alpha": "auto", "auto_alpha_args":{"alpha_max": 0.6, - # "alpha_min":0.5, "alpha_step":0.1, "shared_criterion": "mean", "do_blockwise": False}}, - # } - # sq_config = SmoothQuantConfig( - # tokenizer=tokenizer, # either two of one, tokenizer or calib_func - # calib_iters=2, - # recipes=recipes, - # ipex_opt_llm=False - # ) - # q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=sq_config, - # use_neural_speed=False - # ) - # self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule)) - - # # weight-only - # # RTN - # woq_config = RtnConfig(bits=4) - # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=woq_config, - # use_neural_speed=False - # ) - # woq_model.eval() - # output = woq_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047, rel_tol=1e-04)) + # Dynamic quant + dq_config = DynamicQuantConfig() + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=dq_config, + ) + q_model.eval() + output = q_model(dummy_input) + q_model.save_pretrained("./saved_results") + output = q_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) + q_model = AutoModelForCausalLM.from_pretrained("./saved_results") + output = q_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17140813171863556, rel_tol=1e-04)) + # Static quant + sq_config = 
StaticQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + calib_iters=2, + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config, + ) + q_model.eval() + output = q_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) + q_model.save_pretrained("./saved_results") + loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") + loading_model.eval() + output = loading_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17378684878349304, rel_tol=1e-04)) + # Quant aware training + qat_config = QuantAwareTrainingConfig( + tokenizer=tokenizer, # either two of one, tokenizer or train_func + train_iters=2, + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=qat_config, + ) + q_model.eval() + output = q_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) + q_model.save_pretrained("./saved_results") + loading_model = AutoModelForCausalLM.from_pretrained("./saved_results") + loading_model.eval() + output = loading_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17362995445728302, rel_tol=1e-04)) + # Smoothquant + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + n_samples=2, + ipex_opt_llm=False + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config, + ) + self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + + # Smoothquant auto + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + n_samples=2, + alpha="auto", + alpha_max=0.6, + alpha_min=0.5, + alpha_step=0.1, + shared_criterion="mean", + do_blockwise=False, + ipex_opt_llm=False + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config, + ) + self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) - # # AWQ - # woq_config = AwqConfig(bits=4, - # zero_point=False, - # calib_iters=5, - # tokenizer=tokenizer - # ) + # weight-only + # RTN + woq_config = RtnConfig(bits=4) + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=woq_config, + use_neural_speed=False + ) + woq_model.eval() + output = woq_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047, rel_tol=1e-04)) + + # AWQ + woq_config = AwqConfig(bits=4, + zero_point=False, + n_samples=5, + batch_size=1, + seq_len=512, + tokenizer=tokenizer + ) - # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=woq_config, - # use_neural_speed=False - # ) - # woq_model.eval() - # output = woq_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.18019595742225647 , rel_tol=1e-04)) + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=woq_config, + use_neural_speed=False + ) + woq_model.eval() + output = woq_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.20071472227573395 , rel_tol=1e-04)) # # TEQ + # need INC fix. 
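# Hedged sketch of the flattened SmoothQuant "auto alpha" options exercised in the test
# above: alpha_max/alpha_min/alpha_step/shared_criterion/do_blockwise are now top-level
# SmoothQuantConfig arguments rather than a nested recipes dict. The checkpoint name is
# a placeholder.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, SmoothQuantConfig

model_name = "facebook/opt-125m"     # placeholder small checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
sq_config = SmoothQuantConfig(
    tokenizer=tokenizer,
    n_samples=2,
    alpha="auto",
    alpha_max=0.6,
    alpha_min=0.5,
    alpha_step=0.1,
    shared_criterion="mean",
    do_blockwise=False,
    ipex_opt_llm=False,
)
q_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=sq_config)
# After SmoothQuant plus IPEX tracing the returned object is a torch.jit.ScriptModule.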
# woq_config = TeqConfig(bits=4, - # calib_iters=5, - # tokenizer=tokenizer, + # n_samples=5, + # batch_size=1, + # seq_len=512, + # tokenizer=tokenizer # ) # woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, # quantization_config=woq_config, @@ -355,16 +357,17 @@ def test_quantization_for_llm(self): # woq_model.eval() # output = woq_model(dummy_input) - # # fp8 - # woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") - # woq_model = AutoModelForCausalLM.from_pretrained( - # model_name_or_path, quantization_config=woq_config, use_neural_speed=False - # ) - # woq_model.eval() - # output = woq_model(dummy_input) - # self.assertTrue( - # isclose(float(output[0][0][0][0]), 0.16162332892417908, rel_tol=1e-04) - # ) + + # fp8 + woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") + woq_model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, quantization_config=woq_config, use_neural_speed=False + ) + woq_model.eval() + output = woq_model(dummy_input) + self.assertTrue( + isclose(float(output[0][0][0][0]), 0.16162332892417908, rel_tol=1e-04) + ) # # amp # amp_config = MixedPrecisionConfig() @@ -376,24 +379,24 @@ def test_quantization_for_llm(self): # output = amp_model(dummy_input) # self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04)) - # # load_in_4bit - # bit4_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # load_in_4bit=True, - # use_neural_speed=False - # ) - # bit4_model.eval() - # output = bit4_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.18726778030395508, rel_tol=1e-04)) - - # # load_in_8bit - # bit8_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # load_in_8bit=True, - # use_neural_speed=False, - # device_map="cpu" - # ) - # bit8_model.eval() - # output = bit8_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.16759155690670013, rel_tol=1e-04)) + # load_in_4bit + bit4_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + load_in_4bit=True, + use_neural_speed=False + ) + bit4_model.eval() + output = bit4_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047, rel_tol=1e-04)) + + # load_in_8bit + bit8_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + load_in_8bit=True, + use_neural_speed=False, + device_map="cpu" + ) + bit8_model.eval() + output = bit8_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.16759155690670013, rel_tol=1e-04)) # GPTQ woq_config = GPTQConfig(bits=4, @@ -402,8 +405,8 @@ def test_quantization_for_llm(self): desc_act=False, damp_percent=0.01, blocksize=32, - nsamples=3, - max_input_length=256, + n_samples=3, + seq_len=256, tokenizer=tokenizer, batch_size=1 ) @@ -413,13 +416,13 @@ def test_quantization_for_llm(self): ) woq_model.eval() output = woq_model(dummy_input) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17126554250717163, rel_tol=1e-04)) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.1800851970911026, rel_tol=1e-04)) # AUTOROUND woq_config = AutoRoundConfig(bits=4, - weight_dtype="int4_clip", - nsamples=128, - calib_len=32, + weight_dtype="int4_clip", + n_samples=128, + seq_len=32, iters=5, tokenizer=tokenizer ) @@ -430,7 +433,7 @@ def test_quantization_for_llm(self): woq_model.eval() output = woq_model(dummy_input) if CpuInfo().bf16: - self.assertTrue(isclose(float(output[0][0][0][0]), 0.169921875, rel_tol=1e-04)) + 
self.assertTrue(isclose(float(output[0][0][0][0]), 0.1513671875, rel_tol=1e-04)) def test_export(self): # test model with model_id diff --git a/tests/CI/test_weight_only.py b/tests/CI/test_weight_only.py index bca5f2ba169..118a03d094d 100644 --- a/tests/CI/test_weight_only.py +++ b/tests/CI/test_weight_only.py @@ -41,6 +41,11 @@ from intel_extension_for_transformers.transformers.llm.utils.generation import _beam_search, _greedy_search from intel_extension_for_transformers.transformers import RtnConfig +import random +random.seed(1234) +torch.manual_seed(1234) +import numpy as np +np.random.seed(1234) class DummyDataset(data.Dataset): def __init__(self): @@ -130,29 +135,6 @@ def test_int8(self): print(output_quant) assert torch.allclose(output, output_quant, rtol=0.01) - def test_int4(self): - raw_wei = torch.rand(2, 32, dtype=torch.float) - compress_wei = qbits.quantize_to_packed_weight( - raw_wei, True, 32, "fp32", "nf4", "fp32", False) - revert_wei = torch.zeros(2, 32, dtype=torch.float) - qbits.dequantize_packed_weight(compress_wei, revert_wei, True, - "fp32", "nf4", "fp32") - for bias in [True, False]: - model = M(with_bias=bias) - with torch.no_grad(): - model.linear.weight = torch.nn.Parameter(revert_wei) - activation = torch.rand(1, 5, 32, dtype=torch.float) - output = model(activation) - with torch.no_grad(): - model.linear.weight = torch.nn.Parameter(raw_wei) - config = RtnConfig( - bits=4, weight_dtype="nf4", group_size=32) - convert_to_quantized_model(model, config) - output_quant = model(activation) - print(output) - print(output_quant) - assert torch.allclose(output, output_quant, rtol=0.01) - def test_woq_with_ipex_cpu(self): model_name_or_path = "facebook/opt-125m" config = RtnConfig(bits=4, use_ipex=True) diff --git a/tests/CI/test_weight_only_gpu.py b/tests/CI/test_weight_only_gpu.py index a73715a222e..b34d30a6b83 100644 --- a/tests/CI/test_weight_only_gpu.py +++ b/tests/CI/test_weight_only_gpu.py @@ -23,7 +23,7 @@ from intel_extension_for_transformers.transformers import GPTQConfig, RtnConfig from math import isclose from transformers import AutoTokenizer -from intel_extension_for_transformers.tools.utils import is_intel_gpu_available, is_ipex_available +from intel_extension_for_transformers.tools.utils import is_intel_gpu_available, is_ipex_available, _ipex_version from torch.utils.data import DataLoader @@ -68,7 +68,7 @@ def forward(self, x): return self.linear(x) -@unittest.skipIf(not is_ipex_available() or not is_intel_gpu_available(), +@unittest.skipIf(not is_ipex_available() or not _ipex_version >= "2.3.10" or not is_intel_gpu_available(), "There is no Intel GPU in this machine, skip this test!") class TestArcWeightOnly(unittest.TestCase): From daa485c50a40765a990c1cfd13441235480b8446 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 10 Jul 2024 21:12:32 -0700 Subject: [PATCH 28/28] remove engine ci and neuralchat ci Signed-off-by: changwangss --- .github/checkgroup.yml | 34 ------------------- .../quantization/run_generation_sq.py | 4 +++ .../transformers/llm/quantization/sq_utils.py | 7 ++-- .../transformers/llm/quantization/utils.py | 21 ++++-------- .../transformers/utils/config.py | 17 ++++++++-- tests/CI/test_quantization.py | 18 +++++----- tests/CI/test_weight_only.py | 4 ++- 7 files changed, 41 insertions(+), 64 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 57c7ab30a60..e1f6b0c3735 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -30,40 +30,6 @@ subprojects: - "optimize-unit-test-PR-test" - 
"Genreate-OptimizeUT-Report" - - id: "NeuralChat Unit Test" - paths: - - ".github/workflows/unit-test-neuralchat.yml" - - ".github/workflows/script/unitTest/run_unit_test_neuralchat.sh" - - "intel_extension_for_transformers/neural_chat/**" - - "requirements.txt" - - "setup.py" - - "intel_extension_for_transformers/transformers/llm/finetuning/**" - - "intel_extension_for_transformers/transformers/llm/quantization/**" - - "intel_extension_for_transformers/transformers/**" - - "intel_extension_for_transformers/langchain/**" - - "!intel_extension_for_transformers/neural_chat/docs/**" - - "!intel_extension_for_transformers/neural_chat/examples/**" - - "!intel_extension_for_transformers/neural_chat/assets/**" - - "!intel_extension_for_transformers/neural_chat/README.md" - checks: - - "neuralchat-unit-test-baseline" - - "neuralchat-unit-test-PR-test" - - "Generate-NeuralChat-Report" - - - id: "Engine Unit Test workflow" - paths: - - ".github/workflows/unit-test-engine.yml" - - "requirements.txt" - - "setup.py" - - intel_extension_for_transformers/transformers/** - - "intel_extension_for_transformers/transformers/runtime/**" - - "!intel_extension_for_transformers/transformers/runtime/kernels/**" - - "!intel_extension_for_transformers/transformers/runtime/third_party/**" - - "!intel_extension_for_transformers/transformers/runtime/docs/**" - checks: - - "engine-unit-test-baseline" - - "engine-unit-test-PR-test" - - "Genreate-Engine-Report" # - id: "Windows Binary Test" # paths: diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py index c3f7afed86f..fd727af4d53 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_sq.py @@ -67,6 +67,8 @@ "--seq_len", default=512, type=int, help="Smooth quant calibration input length." 
) parser.add_argument("--batch_size", default=1, type=int, help="batch size num.") +parser.add_argument("--padding", action="store_true") +parser.add_argument("--shuffle", action="store_true") # sq alpha "auto" parameters parser.add_argument("--scale_sharing", action="store_true") parser.add_argument( @@ -150,6 +152,8 @@ alpha_step=args.alpha_step, shared_criterion=args.shared_criterion, do_blockwise=args.do_blockwise, + shuffle=args.shuffle, + padding=args.padding, num_beams=generate_kwargs["num_beams"], ) else: diff --git a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py index 27cfb81764b..7d5e52c21ee 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/sq_utils.py @@ -229,10 +229,11 @@ def get_dataloader( model_type, quantization_config, past_key_values, - shuffle=False, - padding=False, - seq_len=512, ): + shuffle=quantization_config.shuffle + padding=quantization_config.padding + seq_len=quantization_config.seq_len + calib_dataset = load_dataset( quantization_config.dataset, split=( diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py index 58dcf2edd59..0678c2eb72e 100644 --- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py @@ -271,8 +271,9 @@ def _replace_linear( quantization_config.weight_dtype not in [ "fp8_e5m2", "fp8_e4m3", + "nf4", + "fp4_e2m1", ] - model._modules[name] = QuantizedLinearQBits( in_features, out_features, @@ -684,6 +685,8 @@ def convert_to_quantized_model(model, config, device="cpu"): logger.warning("The recommended ipex version is higher than 2.3.10 for xpu device.") model.eval() + # INC attribute conflicted with transformers when use nf4/int8 training. 
+ del model.is_quantized q_model = replace_linear(model, None, None, config, device=device) if orig_dtype != torch.float32: @@ -764,19 +767,9 @@ def convert_to_smoothquant_model(model, quantization_config): config=model.config, input_bs=1 ) # get calibration dataloader - if quantization_config.alpha == "auto" and model_type == "llama": - calib_dataloader = get_dataloader( - model_type, - quantization_config, - past_key_values=past_key_values, - shuffle=True, - padding=True, - seq_len=quantization_config.seq_len, - ) - else: - calib_dataloader = get_dataloader( - model_type, quantization_config, past_key_values=past_key_values - ) + calib_dataloader = get_dataloader( + model_type, quantization_config, past_key_values=past_key_values + ) def calib_func(model): with torch.no_grad(): diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 2bb25b791d7..72953c95515 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -30,9 +30,6 @@ torch = LazyImport("torch") -@dataclass -class MixedPrecisionConfig: - dtype: str = "bfloat16" if transformers.__version__ >= "4.32.0": @@ -56,8 +53,18 @@ class QuantizationMethod(str, Enum): STATIC = "static" SmoothQuant = "sq" QuantAwareTraining = "qat" + MixedPrecision = "mp" +class MixedPrecisionConfig(QuantizationConfig): + + quant_method = QuantizationMethod.MixedPrecision + def __init__( + self, + dtype = "bfloat16" + ): + self.dtype = dtype + class SparsityConfig(PretrainedConfig): def __init__( self, @@ -758,6 +765,8 @@ def __init__( excluded_precisions=[], ipex_opt_llm=None, num_beams=1, + shuffle=False, + padding=False, **kwargs, ): self.quant_method = QuantizationMethod.SmoothQuant @@ -776,6 +785,8 @@ def __init__( self.seq_len = seq_len self.ipex_opt_llm = ipex_opt_llm self.num_beams = num_beams + self.shuffle = shuffle + self.padding = padding self.excluded_precisions = excluded_precisions self.batch_size = kwargs.pop("batch_size", 1) diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index c53d139dd51..264d924efad 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -369,15 +369,15 @@ def test_quantization_for_llm(self): isclose(float(output[0][0][0][0]), 0.16162332892417908, rel_tol=1e-04) ) - # # amp - # amp_config = MixedPrecisionConfig() - # amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=amp_config, - # use_neural_speed=False - # ) - # amp_model.eval() - # output = amp_model(dummy_input) - # self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04)) + # amp + amp_config = MixedPrecisionConfig() + amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=amp_config, + use_neural_speed=False + ) + amp_model.eval() + output = amp_model(dummy_input) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04)) # load_in_4bit bit4_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, diff --git a/tests/CI/test_weight_only.py b/tests/CI/test_weight_only.py index 118a03d094d..eb73bb96e5e 100644 --- a/tests/CI/test_weight_only.py +++ b/tests/CI/test_weight_only.py @@ -129,6 +129,7 @@ def test_int8(self): output = model(activation) config = RtnConfig(bits=8, weight_dtype="int8", group_size=32) + config.post_init_cpu() convert_to_quantized_model(model, config) output_quant = model(activation) 
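# Hedged sketch of the standalone flow this test exercises: when RtnConfig is built by
# hand (outside from_pretrained), post_init_cpu() is now called first so the weight and
# compute dtype defaults are resolved before convert_to_quantized_model replaces the
# Linear layers. The toy module and import path below are assumptions based on this patch.
import torch
from intel_extension_for_transformers.transformers import RtnConfig
from intel_extension_for_transformers.transformers.llm.quantization.utils import (
    convert_to_quantized_model,
)

toy = torch.nn.Sequential(torch.nn.Linear(32, 2, bias=True))
config = RtnConfig(bits=8, weight_dtype="int8", group_size=32)
config.post_init_cpu()                     # resolve weight/compute dtype defaults
convert_to_quantized_model(toy, config)    # swaps Linear for a QBits weight-only linear
out = toy(torch.rand(1, 5, 32))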
print(output) @@ -208,8 +209,9 @@ def test_auto_model_saving_loading(self): self.assertTrue(len(module_list) > 0) def test_nf4_training(self): + quantization_config = RtnConfig(bits=4, weight_dtype="nf4", scale_dtype="fp32") model = AutoModelForCausalLM.from_pretrained( - llama_model_path, load_in_4bit=True, use_neural_speed=False) + llama_model_path, quantization_config=quantization_config, use_neural_speed=False) peft_config = LoraConfig( r=8, lora_alpha=16,
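# Hedged sketch of the updated NF4 training setup in test_nf4_training above: an
# explicit RtnConfig(weight_dtype="nf4") now replaces plain load_in_4bit=True before a
# LoRA adapter is attached. The checkpoint name and LoRA task_type are placeholders,
# and the peft calls follow the usual get_peft_model flow rather than this patch verbatim.
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig
from peft import LoraConfig, get_peft_model

quantization_config = RtnConfig(bits=4, weight_dtype="nf4", scale_dtype="fp32")
model = AutoModelForCausalLM.from_pretrained(
    "fxmarty/tiny-llama-fast-tokenizer",   # placeholder tiny llama checkpoint
    quantization_config=quantization_config,
    use_neural_speed=False,
)
peft_config = LoraConfig(r=8, lora_alpha=16, task_type="CAUSAL_LM")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()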