From c3203f7f386a217c90c6b3481c65df153149ecc5 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 16 May 2024 12:03:51 +0800 Subject: [PATCH 01/21] update oneDNN to 2fa152e201 on main (#2889) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 417f0c7a9..0430e47c6 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 417f0c7a91aaf3c1566ace488fc825bc3719abbd +Subproject commit 0430e47c6b2704627977b99ab5556aa0ba6908ce From a5793e1104c7544aa9e74a6b8bedffb9c8d90524 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Thu, 16 May 2024 20:47:36 +0800 Subject: [PATCH 02/21] Xu add lintrunner (#2890) * add lintrunner config. * update lintrunner init version. * update python code check guide. * remove flake8 install requirement file. --- .lintrunner.toml | 31 +++++++++++++++++++++ CONTRIBUTING.md | 4 ++- scripts/tools/setup/requirements-flake8.txt | 10 ------- 3 files changed, 34 insertions(+), 11 deletions(-) create mode 100644 .lintrunner.toml delete mode 100644 scripts/tools/setup/requirements-flake8.txt diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 000000000..fed5d458c --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,31 @@ +[[linter]] +code = 'FLAKE8' +include_patterns = ['*.py'] +exclude_patterns = [ + '.git/**', +] +command = [ + 'python3', + 'scripts/tools/setup/flake8.py', + '--', + '@{{PATHSFILE}}' +] + +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'flake8==3.8.2', + 'flake8-bugbear==20.1.4', + 'flake8-comprehensions==3.3.0', + 'flake8-executable==2.0.4', + # 'git+https://github.com/malfet/flake8-coding.git', + 'flake8-pyi==20.5.0', + 'mccabe==0.6.1', + 'pycodestyle==2.6.0', + 'pyflakes==2.2.0', + 'black==24.3.0', +] \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 795728fd7..d888a816c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -180,7 +180,9 @@ For example, if you wanted to run the test `MayContainAlias`, which is part of t ### Python Code We can find python code style utils in `scripts/tools/setup` folder. Please install the related dependency python modules: ```bash -pip install -r scripts/tools/setup/requirements-flake8.txt +pip install lintrunner +pip install lintrunner-adapters +lintrunner init ``` Please run flake8.py to auto-format python code and check the python code style. 
The script will return results, please manual modify code follow the output information, and until it shows pass: ```bash diff --git a/scripts/tools/setup/requirements-flake8.txt b/scripts/tools/setup/requirements-flake8.txt deleted file mode 100644 index 5689e9542..000000000 --- a/scripts/tools/setup/requirements-flake8.txt +++ /dev/null @@ -1,10 +0,0 @@ -flake8==3.8.2 -flake8-bugbear==20.1.4 -flake8-comprehensions==3.3.0 -flake8-executable==2.0.4 -git+https://github.com/malfet/flake8-coding.git -flake8-pyi==20.5.0 -mccabe -pycodestyle==2.6.0 -pyflakes==2.2.0 -black==24.3.0 From a94cdbd106b6a7751bb95eb51ebfcb1f8ea0a81c Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Fri, 17 May 2024 09:15:38 +0800 Subject: [PATCH 03/21] Update dependency_version.yml 20240517 (#2895) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index fddc3e804..56bca9060 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240514+cpu + version: 2.4.0.dev20240516+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240514+cpu + version: 2.2.0.dev20240516+cpu torchvision: - version: 0.19.0.dev20240514+cpu + version: 0.19.0.dev20240516+cpu transformers: version: 4.38.1 From 6eb32019c650c92b4e93f3d56b1f5d5e7885d927 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Mon, 20 May 2024 09:23:43 +0800 Subject: [PATCH 04/21] Update dependency_version.yml 20240520 (#2904) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 56bca9060..b258442c5 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240516+cpu + version: 2.4.0.dev20240519+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240516+cpu + version: 2.2.0.dev20240519+cpu torchvision: - version: 0.19.0.dev20240516+cpu + version: 0.19.0.dev20240519+cpu transformers: version: 4.38.1 From 34a349e456023b5e9ba13815b31d2c1bad3bdeaf Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Mon, 20 May 2024 12:44:24 +0800 Subject: [PATCH 05/21] Fix DDP resume training (#2902) * refine state_dict in ipex.optimize * remove debug print * fix ut --- .../nn/utils/_parameter_wrapper.py | 149 +++++++----------- .../nn/utils/_weight_prepack.py | 7 +- tests/cpu/ipex-optimize-ddp-static-graph.py | 57 +++++++ tests/cpu/test_ipex_optimize.py | 44 ++++++ 4 files changed, 156 insertions(+), 101 deletions(-) create mode 100644 tests/cpu/ipex-optimize-ddp-static-graph.py diff --git a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py index 25d10ba77..70d0e1769 100644 --- a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py +++ b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py @@ -245,23 +245,56 @@ def remove_empty_tensor(out): return out +def found_wrapper(parameter, params_attr): + for _, v in params_attr.items(): + if parameter is v.parameter: + return v + return None + + def patch_state_dict(model, params_attr, mode): - def cast_back_state_dict(self, *args, destination=None, prefix="", keep_vars=False): - with torch.no_grad(), 
contextlib.ExitStack() as stack: - for v in params_attr.values(): - if mode == "inference": - stack.enter_context(v.inference_cast_save()) + def get_parammeter_from_model(model, name_list): + if name_list[0] == "module" and not hasattr(model, "module"): + # for DDP model, there is an extra module + name_list = name_list[1:] + model_or_param = model + for attr in name_list: + model_or_param = getattr(model_or_param, attr) + return model_or_param + + def to_public_fp32(model, state_dict, params_attr): + data_ptr_dict = {} + for k, v in state_dict.items(): + v_ptr = v.data_ptr() + if v_ptr in data_ptr_dict: + # use cached tensor for multiple parameters share same tensor data + state_dict[k] = data_ptr_dict[v_ptr] + continue + # k = "submodule_name.submodule_name.attr_name" + # for example, "attn.linear.weight" + name_list = k.split(".") + param = get_parammeter_from_model(model, name_list) + param_wrapper = found_wrapper(param, params_attr) + if param_wrapper: + if mode == "inference" and param_wrapper.original_dtype is not None: + state_dict[k] = v.to(param_wrapper.original_dtype) elif mode == "training": - stack.enter_context(v.training_cast_save()) + state_dict[k] = param_wrapper._training_cast_to_fp32() else: assert mode == "prepack" - stack.enter_context(v.prepack_cast_save()) - out = self._original_state_dict( + state_dict[k] = param_wrapper._unpack_cast_to_fp32() + data_ptr_dict[v_ptr] = state_dict[k] + return state_dict + + def cast_back_state_dict(self, *args, destination=None, prefix="", keep_vars=False): + with torch.no_grad(), contextlib.ExitStack() as stack: + state_dict = self._original_state_dict( *args, destination=destination, prefix=prefix, keep_vars=keep_vars ) # We don't save the _ipex_module_empty_weight_tensor or _ipex_module_empty_bias_tensor Parameter in the state dict - out = remove_empty_tensor(out) - return out + state_dict = remove_empty_tensor(state_dict) + state_dict = to_public_fp32(self, state_dict, params_attr) + return state_dict if not hasattr(model, "_original_state_dict"): setattr(model, "_original_state_dict", model.state_dict) # noqa: B010 @@ -362,52 +395,9 @@ def cast_for_training(self, dtype, split): requires_grad=self.master_parameter.requires_grad, ) - def inference_cast_save(self): - @contextlib.contextmanager - def ctx(): - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.original_dtype) - try: - yield - finally: - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.casted_dtype) - - return ctx() - - def training_cast_save(self): - @contextlib.contextmanager - def ctx(): - self._training_cast_before_save() - try: - yield - finally: - self._training_cast_after_save() - - return ctx() - - def prepack_cast_save(self): - @contextlib.contextmanager - def ctx(): - self._cast_unpack_before_save() - try: - yield - finally: - self._cast_unpack_after_save() - - return ctx() - - def _inference_cast_before_save(self): - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.original_dtype) - - def _inference_cast_after_save(self): - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.casted_dtype) - - def _training_cast_before_save(self): + def _training_cast_to_fp32(self): if self.original_dtype is None: - return + return self.parameter.data.detach() assert self.original_dtype in ( torch.float, torch.float32, @@ -417,51 +407,20 @@ def _training_cast_before_save(self): fp32_param = torch.ops.torch_ipex.cat_bfloat16_float( 
self.parameter.data, self.parameter_trail ) - with torch.no_grad(): - self.parameter.data = fp32_param - else: - # will save parameter for non-split case - with torch.no_grad(): - self.parameter.data = self.master_parameter.data - - def _training_cast_after_save(self): - if self.original_dtype is None: - return - if self.split: - assert self.casted_dtype == torch.bfloat16 - top, self.parameter_trail = torch.ops.torch_ipex.split_float_bfloat16( - self.parameter.data - ) - with torch.no_grad(): - self.parameter.data = top + return fp32_param.detach() else: - self.parameter.data = self.master_parameter.data.to(self.casted_dtype) + return self.master_parameter.data.detach() - def _cast_unpack_before_save(self): + def _unpack_cast_to_fp32(self): + fp32_param = self.parameter.data if self.split is not None: - self._training_cast_before_save() + fp32_param = self._training_cast_to_fp32() elif self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.original_dtype) + fp32_param = self.parameter.to(self.original_dtype) if self.op_ctx is None: - return - with torch.no_grad(): - if self.master_parameter is not None: - self.parameter.data = self.op_ctx.to_public(self.master_parameter) - else: - self.parameter.data = self.op_ctx.to_public(self.parameter) - - def _cast_unpack_after_save(self): - if self.split is not None: - self._training_cast_after_save() - elif self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.casted_dtype) - if self.op_ctx is None: - return - with torch.no_grad(): - if self.master_parameter is None: - self.parameter.data = self.op_ctx.pack(self.parameter) - if self.parameter_trail is not None: - self.parameter_trail = self.op_ctx.pack(self.parameter_trail) + return fp32_param + else: + return self.op_ctx.to_public(fp32_param) def can_prepack(self, module, is_training): if self.num_modules != 1: diff --git a/intel_extension_for_pytorch/nn/utils/_weight_prepack.py b/intel_extension_for_pytorch/nn/utils/_weight_prepack.py index 67725ac2e..604232ecd 100644 --- a/intel_extension_for_pytorch/nn/utils/_weight_prepack.py +++ b/intel_extension_for_pytorch/nn/utils/_weight_prepack.py @@ -430,18 +430,13 @@ def weight_prepack_with_ipex(model, optimizer, params_attr, device_type="cpu"): patch_state_dict, get_shared_parameter_status, IPEX_WEIGHT_PREPACK_MODULE_CPU, + found_wrapper, ) is_training = optimizer is not None if len(params_attr) == 0: get_shared_parameter_status(model, params_attr) - def found_wrapper(parameter, params_attr): - for _, v in params_attr.items(): - if parameter is v.parameter: - return v - return None - def convert(m, optimizer, params_attr): # already packed for reentrancy test if m.__class__ in IPEX_WEIGHT_PREPACK_MODULE_CPU().values(): diff --git a/tests/cpu/ipex-optimize-ddp-static-graph.py b/tests/cpu/ipex-optimize-ddp-static-graph.py new file mode 100644 index 000000000..62937b4ca --- /dev/null +++ b/tests/cpu/ipex-optimize-ddp-static-graph.py @@ -0,0 +1,57 @@ +import torch +import os +import intel_extension_for_pytorch as ipex +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist +import argparse + + +class Module(torch.nn.Module): + def __init__( + self, + ): + super(Module, self).__init__() + self.linear = torch.nn.Linear(1024, 1024, bias=False) + + def forward(self, x): + return self.linear(x) + + +torch.manual_seed(10) +model = Module() +optim = torch.optim.SGD(model.parameters(), lr=1) + +opt_model, opt = ipex.optimize( + model, dtype=torch.bfloat16, optimizer=optim, 
inplace=False, weights_prepack=False +) + + +def env2int(env_list, default=-1): + for e in env_list: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +rank = env2int(["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0) + +os.environ["MASTER_ADDR"] = "127.0.0.1" +os.environ["MASTER_PORT"] = "29510" +dist.init_process_group("gloo", world_size=2, rank=rank) +my_rank = dist.get_rank() +parser = argparse.ArgumentParser() +parser.add_argument("--get-state-dict", action="store_true") +args = parser.parse_args() + +opt_model = DDP(opt_model, static_graph=True) +for i in range(10): + input = torch.randn(1024, 1024).bfloat16() + output = opt_model(input) + if i == 5 and my_rank == 0 and args.get_state_dict: + state_dict = opt_model.state_dict() + loss = output.sum() + loss.backward() + opt.step() + if i == 9: + print(f"Resume training successfully, final lose = {loss.item()}") diff --git a/tests/cpu/test_ipex_optimize.py b/tests/cpu/test_ipex_optimize.py index 95e3cbf8f..fc2b2b02e 100644 --- a/tests/cpu/test_ipex_optimize.py +++ b/tests/cpu/test_ipex_optimize.py @@ -24,6 +24,7 @@ from common_utils import TestModule, _empty_weight_bias_parameter_names from intel_extension_for_pytorch.optim._lamb import Lamb import os +import subprocess try: import transformers @@ -914,6 +915,49 @@ def run_and_recursively_call_ipex_optimize( graph_mode, ) + def test_ddp_strict_graph(self): + # check if the model can be trained with DDP in strict graph mode + # with calling "statce_dict" during training, and also check there + # is no difference for final lose between two training. + def get_loss(line): + loss = line.split(" = ")[-1] + if loss.endswith("]"): + loss = loss[:-3] + return float(loss) + + num = 0 + loc = os.path.dirname(os.path.abspath(__file__)) + loss = -1 + with subprocess.Popen( + "python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1" + + f" --nproc_per_node=2 --distributed --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py --get-state-dict", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) as p: + for line in p.stdout.readlines(): + line = str(line, "utf-8").strip() + if "Resume training successfully" in line: + loss = get_loss(line) + num = num + 1 + assert num == 2, "training not finished." + + num = 0 + with subprocess.Popen( + "python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1" + + f" --nproc_per_node=2 --distributed --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) as p: + for line in p.stdout.readlines(): + line = str(line, "utf-8").strip() + if "Resume training successfully" in line: + loss_ = get_loss(line) + self.assertEqual(loss_, loss) + num = num + 1 + assert num == 2, "training not finished." 
+ if __name__ == "__main__": test = unittest.main() From bb14ce18b9d6504f122e3d1a12aebe6c966111de Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Mon, 20 May 2024 22:18:19 +0800 Subject: [PATCH 06/21] Refine tpp flags in IPEX linear (#2906) --- csrc/cpu/aten/AddLayerNorm.cpp | 29 ++++- csrc/cpu/aten/AddLayerNorm.h | 10 ++ .../inference/python/llm-modeling/README.md | 5 +- .../cpu/inference/python/llm-modeling/run.py | 3 +- examples/cpu/training/llm/README.md | 7 +- .../cpu/training/llm/templates/alpaca.json | 6 - .../llm/functional/__init__.py | 4 + .../llm/functional/fusions.py | 87 +++++++++++++ .../llm/functional/utils.py | 20 +++ .../llm/modules/mha_fusion.py | 19 +-- .../nn/utils/_parameter_wrapper.py | 1 + .../models/cpu/fusions/mha_fusion.py | 58 +++++++++ tests/cpu/test_ipex_llm_module.py | 116 ++++++++++++++++++ tests/cpu/test_tpp_linear.py | 20 +++ 14 files changed, 367 insertions(+), 18 deletions(-) delete mode 100644 examples/cpu/training/llm/templates/alpaca.json create mode 100644 intel_extension_for_pytorch/llm/functional/utils.py diff --git a/csrc/cpu/aten/AddLayerNorm.cpp b/csrc/cpu/aten/AddLayerNorm.cpp index 22c400bfb..fe5415a80 100644 --- a/csrc/cpu/aten/AddLayerNorm.cpp +++ b/csrc/cpu/aten/AddLayerNorm.cpp @@ -4,7 +4,7 @@ // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp #include "AddLayerNorm.h" - +#include #include namespace torch_ipex { @@ -57,5 +57,32 @@ at::Tensor dil_add_layernorm( return at::layer_norm(add_res, normalized_shape, weight_opt, bias_opt, eps); } } + +// register as a python op +at::Tensor add_layernorm( + const at::Tensor& a, + const at::Tensor& b, + int64_t alpha, + at::IntArrayRef normalized_shape, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + double eps) { + RECORD_FUNCTION("add_layernorm", c10::ArrayRef({})); + return dil_add_layernorm( + a, b, alpha, normalized_shape, weight_opt, bias_opt, eps, false); +} + } // namespace cpu } // namespace torch_ipex + +namespace { + +TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { + m.def( + "add_layernorm(Tensor a, Tensor b, int alpha, int[] normalized_shape, Tensor ? weight_opt, \ + Tensor ? 
bias_opt, float eps) -> Tensor"); + m.impl( + "add_layernorm", c10::DispatchKey::CPU, torch_ipex::cpu::add_layernorm); +} + +} // namespace \ No newline at end of file diff --git a/csrc/cpu/aten/AddLayerNorm.h b/csrc/cpu/aten/AddLayerNorm.h index 67b26a00f..76b74a421 100644 --- a/csrc/cpu/aten/AddLayerNorm.h +++ b/csrc/cpu/aten/AddLayerNorm.h @@ -81,6 +81,16 @@ at::Tensor dil_add_layernorm( float eps, bool cuda_enable); +// register as a python op +at::Tensor add_layernorm( + const at::Tensor& a, + const at::Tensor& b, + int64_t alpha, + at::IntArrayRef normalized_shape, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + double eps); + namespace { at::Tensor add_layer_norm_kernel_impl( diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md index 2587a188c..694b7b605 100644 --- a/examples/cpu/inference/python/llm-modeling/README.md +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -37,7 +37,10 @@ ipex.llm.functional.rms_norm ipex.llm.functional.fast_layer_norm ipex.llm.functional.indirect_access_kv_cache_attention ipex.llm.functional.varlen_attention - +ipex.llm.functional.add_layer_norm +ipex.llm.functional.add_rms_norm +ipex.llm.functional.silu_mul +ipex.llm.functional.gelu_mul ``` ### Generation related fusions diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py index 2090797eb..6a34ca845 100644 --- a/examples/cpu/inference/python/llm-modeling/run.py +++ b/examples/cpu/inference/python/llm-modeling/run.py @@ -5,7 +5,6 @@ import argparse from transformers import ( AutoTokenizer, - LlamaTokenizer, AutoModelForCausalLM, ) import transformers @@ -22,7 +21,7 @@ MODEL_CLASSES = { "gpt-j": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/training/llm/README.md b/examples/cpu/training/llm/README.md index 50dad1f90..79b18ce4d 100644 --- a/examples/cpu/training/llm/README.md +++ b/examples/cpu/training/llm/README.md @@ -29,8 +29,13 @@ export HOSTFILE=hostfile # Quick Start Scripts ## Run the model ``` -# Get the dataset here https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json +# Get the dataset here: https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json export DATASET="./alpaca_data.json" + +# Get the dataset template here: https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json +mkdir ./templates +mv alpaca.json ./templates + # Env vars export LOCAL_BATCH_SIZE=32 #32 is default one, you can choose per need export MODEL_NAME_OR_PATH="YOUR LOCAL PATH or MODEL_ID (HF)" diff --git a/examples/cpu/training/llm/templates/alpaca.json b/examples/cpu/training/llm/templates/alpaca.json deleted file mode 100644 index e486439c4..000000000 --- a/examples/cpu/training/llm/templates/alpaca.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "description": "Template used by Alpaca-LoRA.", - "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", - "prompt_no_input": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", - "response_split": "### Response:" -} diff --git a/intel_extension_for_pytorch/llm/functional/__init__.py b/intel_extension_for_pytorch/llm/functional/__init__.py index 82f0b5e45..023dd4832 100644 --- a/intel_extension_for_pytorch/llm/functional/__init__.py +++ b/intel_extension_for_pytorch/llm/functional/__init__.py @@ -4,4 +4,8 @@ fast_layer_norm, indirect_access_kv_cache_attention, varlen_attention, + add_layer_norm, + add_rms_norm, + silu_mul, + gelu_mul, ) diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py b/intel_extension_for_pytorch/llm/functional/fusions.py index 7251bc525..04f05dc49 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -8,6 +8,8 @@ VarlenAttention, ) +from .utils import _get_function_from_device + def rotary_embedding( query: torch.Tensor, @@ -209,3 +211,88 @@ def varlen_attention( return_softmax, gen_, ) + + +def silu_mul(x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None): + r""" + Applies PyTorch silu on input x, and them mul input y: + out = silu(x)*y + + Args: + x (torch.Tensor): input to apply silu. + y (torch.Tensor): input for mul to apply on silu(x). + out (torch.Tensor): buffer to get the results. + + """ + f = _get_function_from_device(x.device.type, silu_mul) + return f(x, y, out) + + +def gelu_mul( + x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None, approximate="none" +): + r""" + Applies PyTorch gelu on input x, and them mul input y: + out = gelu(x)*y + + Args: + x (torch.Tensor): input to apply gelu. + y (torch.Tensor): input for mul to apply on gelu(x). + out (torch.Tensor): buffer to get the results. + approximate (str): approximate config for gelu. + + """ + f = _get_function_from_device(x.device.type, gelu_mul) + return f(x, y, out, approximate) + + +def add_rms_norm( + residual: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool = False, +): + r""" + Add residual on input x and apply RMSnorm on the result. + + Args: + residual (torch.Tensor): residual to add with x. If residual is None, + it means only apply rmsnorm on x. + x (torch.Tensor) : the input tensor to add residual and apply RMSNorm. + weight (torch.Tensor): the weight to apply RMSnorm. + bias (torch.Tensor): the bias to apply RMSnorm. + eps (float) : the variance_epsilon to apply RMSnorm. + add_back (bool) : whether to store the result of (x + residual) back + to the residual buffer (if residual is not None). Default is False. + + """ + f = _get_function_from_device(x.device.type, add_rms_norm) + return f(residual, x, weight, bias, eps, add_back) + + +def add_layer_norm( + residual: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool = False, +): + r""" + Add residual on input x and apply layernorm on the result. + + Args: + residual (torch.Tensor): residual to add with x. If residual is None, + it means only apply layernorm on x. + x (torch.Tensor) : the input tensor to add residual and apply layernorm. + weight (torch.Tensor): the weight to apply layernorm. + bias (torch.Tensor): the bias to apply layernorm. + eps (float) : the variance_epsilon to apply layernorm. + add_back (bool) : whether to store the result of (x + residual) back + to the residual buffer (if residual is not None). Default is False. 
+ + """ + f = _get_function_from_device(x.device.type, add_layer_norm) + return f(residual, x, weight, bias, eps, add_back) diff --git a/intel_extension_for_pytorch/llm/functional/utils.py b/intel_extension_for_pytorch/llm/functional/utils.py new file mode 100644 index 000000000..ba834a1c7 --- /dev/null +++ b/intel_extension_for_pytorch/llm/functional/utils.py @@ -0,0 +1,20 @@ +import sys +from intel_extension_for_pytorch.transformers.models.cpu.fusions.mha_fusion import ( # noqa F401 + silu_mul_cpu, + gelu_mul_cpu, + add_rms_norm_cpu, + add_layer_norm_cpu, +) + + +def _get_function_from_device(device_type: str, f): + assert device_type in [ + "cpu", + "xpu", + ], "The device is not in the supported device list." + target_f_name = f.__name__ + "_" + device_type + assert hasattr( + sys.modules[__name__], target_f_name + ), f"Target function {f.__name__} on {device_type} haven't implemented yet." + target_f = getattr(sys.modules[__name__], target_f_name) + return target_f diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index c3f410e63..940fea611 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -9,13 +9,15 @@ class RotaryEmbedding(nn.Module): [module init and forward] Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) on the `query ` or `key` before their multi-head attention computation. Args: - module init: - - max_position_embeddings (int): size (max) of the position embeddings. - - pos_embd_dim (int): dimension of the position embeddings. - - base (int) : Default: 10000. Base to generate the frequency of position embeddings. - - backbone (str): Default: None. The exact transformers model backbone - (e.g., "GPTJForCausalLM", get from model.config.architectures[0], - see https://huggingface.co/EleutherAI/gpt-j-6b/blob/main/config.json#L4). + max_position_embeddings (int): size (max) of the position embeddings. + pos_embd_dim (int): dimension of the position embeddings. + base (int) : Default: 10000. Base to generate the frequency of position embeddings. + backbone (str): Default: None. The exact transformers model backbone + (e.g., "GPTJForCausalLM", get from model.config.architectures[0], + see https://huggingface.co/EleutherAI/gpt-j-6b/blob/main/config.json#L4). + extra_rope_config (dict): like phi-3 model, it uses original_max_position_embeddings, + long_factor and short_factor, see details: + https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json#L23. 
forward: - input (torch.Tensor) : input to be applied with position embeddings, @@ -70,12 +72,14 @@ def __init__( pos_embd_dim: int, base=10000, backbone: str = None, + extra_rope_config: dict = None, ): super().__init__() self.model_backbone = backbone self.max_position_embeddings = max_position_embeddings self.pos_embd_dim = pos_embd_dim self.base = base + self.extra_rope_config = extra_rope_config def forward( self, @@ -107,6 +111,7 @@ def forward( self.pos_embd_dim, self.base, self.model_backbone, + self.extra_rope_config, ) return runtime_module( x, diff --git a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py index 70d0e1769..3e3894d01 100644 --- a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py +++ b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py @@ -546,6 +546,7 @@ def linear_prepack(self, module, is_training): if not hasattr(module, "out_features"): setattr(module, "out_features", module.weight.shape[0]) # noqa: B010 + module.tpp_fallback = False if module.use_tpp: from intel_extension_for_pytorch.nn.utils import ( Apply_TPPLinear_weight_prepack, diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index 0e01d94dc..2de1e0ddb 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -488,3 +488,61 @@ def forward( return_softmax, gen_, ) + + +def add_rms_norm_cpu( + add: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool, +): + assert bias is None, "bias is not supported in add_rmsnorm yet" + if add is not None: + if add_back: + add.add_(x) + input = add + else: + input = add + x + else: + input = x + + return torch.ops.torch_ipex.rmsnorm(input, weight, eps) + + +def add_layer_norm_cpu( + add: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool, +): + if add is not None: + out = torch.ops.torch_ipex.add_layernorm( + x, add, 1, [x.size(-1)], weight, bias, eps + ) + if add_back: + add.add_(x) + return out + else: + return torch.nn.functional.layer_norm( + x, [x.size(-1)], weight=weight, bias=bias, eps=eps + ) + + +@torch.compile(dynamic=True, options={"fx_graph_cache": True}) +def silu_mul_cpu(x, y, out=None): + res = torch.nn.functional.silu(x) * y + if out is not None: + out.copy_(res) + return res + + +@torch.compile(dynamic=True, options={"fx_graph_cache": True}) +def gelu_mul_cpu(x, y, out=None, approximate="none"): + res = torch.nn.functional.gelu(x, approximate=approximate) * y + if out is not None: + out.copy_(res) + return res diff --git a/tests/cpu/test_ipex_llm_module.py b/tests/cpu/test_ipex_llm_module.py index f3ee00552..1674b2b01 100644 --- a/tests/cpu/test_ipex_llm_module.py +++ b/tests/cpu/test_ipex_llm_module.py @@ -148,6 +148,49 @@ def apply(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): return torch.cat([c, d], dim=-1) +def add_rmsnorm(residual, x, weight, bias, eps, add_back): + orig_dtype = x.dtype + x = x.to(torch.float32) + if residual is not None: + x = residual + x + variance = x.pow(2).mean(dim=-1, keepdim=True) + out = x * torch.rsqrt(variance + eps) + out = out.to(orig_dtype) * weight + if add_back and residual is not None: + residual.copy_(x.to(orig_dtype)) + return out + + +def add_layernorm(residual, x, weight, bias, 
eps, add_back): + if residual is None: + return torch.nn.functional.layer_norm( + x, [x.size(-1)], weight=weight, bias=bias, eps=eps + ) + x = residual + x + out = torch.nn.functional.layer_norm( + x, [x.size(-1)], weight=weight, bias=bias, eps=eps + ) + if add_back: + residual.copy_(x) + return out + + +def silu_mul(x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None): + if out is None: + out = torch.empty_like(x) + out = torch.nn.functional.silu(x) * y + return out + + +def gelu_mul( + x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None, approximate="none" +): + if out is None: + out = torch.empty_like(x) + out = torch.nn.functional.gelu(x, approximate=approximate) * y + return out + + class TestLLMModules(TestCase): def test_linearfusion_args0(self): x1 = torch.rand(1, 4, 4096) @@ -314,6 +357,79 @@ def test_rotary_embedding_tgi(self): self.assertEqual(ipex_q, ref_q) self.assertEqual(ref_k, ipex_k) + def test_add_layernorm(self): + for add_back in [True, False]: + for dtype in [torch.float, torch.bfloat16]: + for residual_is_none in [True, False]: + weight = torch.nn.Parameter(torch.randn(4096)).to(dtype) + eps = 1e-6 + x = torch.rand(1, 32, 4096).to(dtype) + if residual_is_none: + residual = None + else: + if add_back: + target_residual = x + x + residual = x + x_ = copy.deepcopy(x) + residual_ = x_ if not residual_is_none else None + ref_out = add_layernorm(residual_, x_, weight, None, eps, add_back) + ipex_out = ipex.llm.functional.add_layer_norm( + residual, x, weight, None, eps, add_back + ) + if not residual_is_none: + if add_back: + self.assertEqual(residual, target_residual) + self.assertEqual(residual_, target_residual) + else: + self.assertEqual(residual, x) + self.assertEqual(residual_, x) + self.assertEqual(ref_out, ipex_out) + + def test_add_rmsnorm(self): + for add_back in [True, False]: + for dtype in [torch.float, torch.bfloat16]: + for residual_is_none in [True, False]: + weight = torch.nn.Parameter(torch.randn(4096)).to(dtype) + eps = 1e-6 + x = torch.rand(1, 32, 4096).to(dtype) + if residual_is_none: + residual = None + else: + if add_back: + target_residual = x + x + residual = x + x_ = copy.deepcopy(x) + residual_ = x_ if not residual_is_none else None + ref_out = add_rmsnorm(residual_, x_, weight, None, eps, add_back) + ipex_out = ipex.llm.functional.add_rms_norm( + residual, x, weight, None, eps, add_back + ) + if not residual_is_none: + if add_back: + self.assertEqual(residual, target_residual) + self.assertEqual(residual_, target_residual) + else: + self.assertEqual(residual, x) + self.assertEqual(residual_, x) + self.assertEqual(ref_out, ipex_out) + + def test_gelu_mul(self): + for dtype in [torch.float, torch.bfloat16]: + for approximate in ["tanh", "none"]: + x = torch.rand(1, 32, 4096).to(dtype) + x_ = copy.deepcopy(x) + ref_out = gelu_mul(x_, x_, approximate=approximate) + ipex_out = ipex.llm.functional.gelu_mul(x_, x_, approximate=approximate) + self.assertEqual(ref_out, ipex_out) + + def test_silu_mul(self): + for dtype in [torch.float, torch.bfloat16]: + x = torch.rand(1, 32, 4096).to(dtype) + x_ = copy.deepcopy(x) + ref_out = silu_mul(x_, x_) + ipex_out = ipex.llm.functional.silu_mul(x_, x_) + self.assertEqual(ref_out, ipex_out) + if __name__ == "__main__": test = unittest.main() diff --git a/tests/cpu/test_tpp_linear.py b/tests/cpu/test_tpp_linear.py index 0f8d1f593..a121be8e7 100644 --- a/tests/cpu/test_tpp_linear.py +++ b/tests/cpu/test_tpp_linear.py @@ -102,6 +102,26 @@ def forward(self, x): class TestTPPlinear(TestCase): + def 
test_tpp_linear_fallback_flag(self): + x1 = torch.rand(1, 1, 4097) + x2 = copy.deepcopy(x1) + for dtype in [torch.float, torch.bfloat16]: + model = Linear_tpp_fallback_dnnl().eval() + + with torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if dtype is torch.bfloat16 else False + ): + ref_out = model(x1) + + model = ipex.optimize(model, dtype=dtype) + with torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if dtype is torch.bfloat16 else False + ): + model = torch.jit.script(model) + model = torch.jit.freeze(model) + out = model(x2) + self.assertEqual(out, ref_out) + def test_tpp_linear_fallback(self): x1 = torch.rand(1, 1, 4097) x2 = copy.deepcopy(x1) From 091a6a2e16cb48fd917b8b2d9d570bb0025f6f01 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Tue, 21 May 2024 07:13:05 +0800 Subject: [PATCH 07/21] Update dependency_version.yml 20240521 (#2908) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index b258442c5..7ffa00ac3 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240519+cpu + version: 2.4.0.dev20240520+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240519+cpu + version: 2.2.0.dev20240520+cpu torchvision: - version: 0.19.0.dev20240519+cpu + version: 0.19.0.dev20240520+cpu transformers: version: 4.38.1 From 0285cc50d20025c53b82de886bed53476f505a99 Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Tue, 21 May 2024 14:13:14 +0800 Subject: [PATCH 08/21] rel 2.3.0 backport to main (#2881) * r230 backport to main * add #2848 * minor format fix * update int8 recipe table & version numbers in serving example part * flake8 format correction --- README.md | 31 +- docker/Dockerfile.prebuilt | 11 +- docker/README.md | 2 +- docs/_static/htmls/tbl_deepspeed.html | 34 +- docs/_static/htmls/tbl_single.html | 117 ++++-- docs/tutorials/api_doc.rst | 63 +++ docs/tutorials/examples.md | 2 +- docs/tutorials/features/fast_bert.md | 2 +- .../features/sq_recipe_tuning_api.md | 2 +- docs/tutorials/installation.md | 4 +- docs/tutorials/introduction.rst | 2 +- docs/tutorials/llm.rst | 10 +- docs/tutorials/llm/llm_optimize.md | 4 +- docs/tutorials/releases.md | 69 ++++ examples/cpu/inference/cpp/README.md | 8 +- .../inference/python/llm-modeling/README.md | 180 +++++++-- .../cpu/inference/python/llm-modeling/run.py | 95 +++-- examples/cpu/inference/python/llm/README.md | 97 +++-- .../inference/python/llm/llm_sq_recipes.md | 3 +- examples/cpu/serving/torchserve/README.md | 8 +- examples/cpu/serving/triton/Dockerfile | 8 +- examples/cpu/serving/triton/requirements.txt | 8 +- .../llm/functional/fusions.py | 155 +++++--- .../llm/modules/linear_fusion.py | 126 +++++- .../llm/modules/mha_fusion.py | 367 ++++++++++-------- 25 files changed, 969 insertions(+), 439 deletions(-) diff --git a/README.md b/README.md index 992b97429..f2f90b7d6 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Intelยฎ Extension for PyTorch\* -**CPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   
[๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.2.0%2Bcpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
+**CPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.3.0%2Bcpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
**GPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
Intelยฎ Extension for PyTorch\* extends PyTorch\* with up-to-date features optimizations for an extra performance boost on Intel hardware. Optimizations take advantage of Intelยฎ Advanced Vector Extensions 512 (Intelยฎ AVX-512) Vector Neural Network Instructions (VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) on Intel CPUs as well as Intel Xe Matrix Extensions (XMX) AI engines on Intel discrete GPUs. Moreover, Intelยฎ Extension for PyTorch* provides easy GPU acceleration for Intel discrete GPUs through the PyTorch* xpu device. @@ -19,28 +19,35 @@ In the current technological landscape, Generative AI (GenAI) workloads and mode | MODEL FAMILY | MODEL NAME (Huggingface hub) | FP32 | BF16 | Static quantization INT8 | Weight only quantization INT8 | Weight only quantization INT4 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:| |LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-8B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLAMA| meta-llama/Meta-Llama-3-70B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | |GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | |DOLLY| databricks/dolly-v2-12b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|FALCON| tiiuae/falcon-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | |FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |OPT| facebook/opt-30b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | |OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | |Bloom| bigscience/bloom-1b7 | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | +|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | -|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |Baichuan| baichuan-inc/Baichuan-13B-Chat | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | | |ChatGLM| THUDM/chatglm3-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |ChatGLM| THUDM/chatglm2-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | -|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | -|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | -|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | ๐ŸŸจ | +|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLaVA| liuhaotian/llava-v1.5-7b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|GIT| microsoft/git-base | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|Yuan| IEITYuan/Yuan2-102B-hf | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | +|Phi| microsoft/phi-2 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | - ๐ŸŸฉ signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32). 
@@ -49,6 +56,10 @@ In the current technological landscape, Generative AI (GenAI) workloads and mode *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. +In addition, Intelยฎ Extension for PyTorch* introduces module level optimization APIs (prototype feature) since release 2.3.0. +The feature provides optimized alternatives for several commonly used LLM modules and functionalities for the optimizations of the niche or customized LLMs. +Please read [**LLM module level optimization practice**](./examples/cpu/inference/python/llm-modeling) to better understand how to optimize your own LLM and achieve better performance. + ## Support The team tracks bugs and enhancement requests using [GitHub issues](https://github.com/intel/intel-extension-for-pytorch/issues/). Before submitting a suggestion or bug report, search the existing GitHub issues to see if your issue has already been reported. diff --git a/docker/Dockerfile.prebuilt b/docker/Dockerfile.prebuilt index c004228f5..e6561cc50 100644 --- a/docker/Dockerfile.prebuilt +++ b/docker/Dockerfile.prebuilt @@ -17,7 +17,8 @@ RUN apt-get update -y && \ apt-get upgrade -y && \ apt-get install -y --no-install-recommends --fix-missing \ ${PYTHON} \ - ${PYTHON}-pip + ${PYTHON}-pip \ + ${PYTHON}-dev RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \ pip \ @@ -27,10 +28,10 @@ RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \ # Some TF tools expect a "python" binary RUN ln -s $(which ${PYTHON}) /usr/local/bin/python -ARG IPEX_VERSION=2.2.0 -ARG PYTORCH_VERSION=2.2.0 -ARG TORCHAUDIO_VERSION=2.2.0 -ARG TORCHVISION_VERSION=0.17.0 +ARG IPEX_VERSION=2.3.0 +ARG PYTORCH_VERSION=2.3.0 +ARG TORCHAUDIO_VERSION=2.3.0 +ARG TORCHVISION_VERSION=0.18.0 ARG TORCH_CPU_URL=https://download.pytorch.org/whl/cpu/torch_stable.html RUN \ diff --git a/docker/README.md b/docker/README.md index 72147b211..b263f74a5 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,7 +2,7 @@ * Notes - If you use linux kernerl under version 5.4 in host, upgrade it. + If you use linux kernel under version 5.4 in host, upgrade it. * How to build an image diff --git a/docs/_static/htmls/tbl_deepspeed.html b/docs/_static/htmls/tbl_deepspeed.html index 2b0dd5bc7..20b94d8bd 100644 --- a/docs/_static/htmls/tbl_deepspeed.html +++ b/docs/_static/htmls/tbl_deepspeed.html @@ -26,6 +26,18 @@

[Hunk bodies condensed: the HTML table markup of tbl_deepspeed.html was stripped during extraction, leaving only the cell text. Across this hunk and the following ones (@@ -83,7 +95,7 @@ and @@ -116,9 +128,27 @@), the diff adds DeepSpeed distributed-inference support-matrix rows for meta-llama/Meta-Llama-3-8B, meta-llama/Meta-Llama-3-70B, stabilityai/stablelm-2-1_6b, Qwen/Qwen-7B-Chat and microsoft/git-base, and updates the status cell of baichuan-inc/Baichuan2-13B-Chat from ๐ŸŸจ to ๐ŸŸฉ. The legend below the table is retained: ๐ŸŸฉ signifies the model performs well with good accuracy (<1% difference compared with FP32); ๐ŸŸจ signifies the model performs well while accuracy may not be perfect (>1% difference compared with FP32).]
+ \ No newline at end of file diff --git a/docs/_static/htmls/tbl_single.html b/docs/_static/htmls/tbl_single.html index 8eaefecf8..3d1aa537f 100644 --- a/docs/_static/htmls/tbl_single.html +++ b/docs/_static/htmls/tbl_single.html
[Hunk bodies condensed: the HTML table markup of tbl_single.html was stripped during extraction, leaving only the cell text. The hunks bring the single-instance LLM support matrix in line with the README table updated earlier in this patch: rows are added or revised for meta-llama/Meta-Llama-3-8B, meta-llama/Meta-Llama-3-70B, tiiuae/falcon-7b, mosaicml/mpt-7b, mistralai/Mixtral-8x7B-v0.1, stabilityai/stablelm-2-1_6b, Qwen/Qwen-7B-Chat, liuhaotian/llava-v1.5-7b, microsoft/git-base, IEITYuan/Yuan2-102B-hf and microsoft/phi-2; several ๐ŸŸฉ/๐ŸŸจ status cells of existing rows are adjusted; and the same ๐ŸŸฉ/๐ŸŸจ accuracy legend follows the table.]
+ \ No newline at end of file diff --git a/docs/tutorials/api_doc.rst b/docs/tutorials/api_doc.rst index 7080252eb..1a161c0a3 100644 --- a/docs/tutorials/api_doc.rst +++ b/docs/tutorials/api_doc.rst @@ -18,7 +18,70 @@ General .. currentmodule:: intel_extension_for_pytorch .. autoclass:: verbose +LLM Module Level Optimizations (Prototype) +****************************************** +Module level optimization APIs are provided for optimizing customized LLMs. + +.. automodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearSilu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearSiluMul + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: Linear2SiluMul + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearRelu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearNewGelu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearGelu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearMul + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearAdd + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearAddAdd + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: RotaryEmbedding + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: RMSNorm + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: FastLayerNorm + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: IndirectAccessKVCacheAttention + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: PagedAttention + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: VarlenAttention + +.. automodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: rotary_embedding + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: rms_norm + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: fast_layer_norm + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: indirect_access_kv_cache_attention + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: varlen_attention Fast Bert (Prototype) ************************ diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index f8bdd5982..f90505a2d 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -359,5 +359,5 @@ $ ldd example-app ## Intelยฎ AI Reference Models -Use cases that have already been optimized by Intel engineers are available at [Intelยฎ AI Reference Models](https://github.com/IntelAI/models/tree/pytorch-r2.2.0-models) (former Model Zoo). A number of PyTorch use cases for benchmarking are also available in the [benchmarks](https://github.com/IntelAI/models/tree/pytorch-r2.2.0-models/benchmarks#pytorch-use-cases). You can get performance benefits out-of-the-box by simply running scripts in the Intelยฎ AI Reference Models. +Use cases that have already been optimized by Intel engineers are available at [Intelยฎ AI Reference Models](https://github.com/IntelAI/models/tree/pytorch-r2.3-models) (former Model Zoo). A number of PyTorch use cases for benchmarking are also available in the [benchmarks](https://github.com/IntelAI/models/tree/pytorch-r2.3-models/benchmarks#pytorch-use-cases). 
You can get performance benefits out-of-the-box by simply running scripts in the Intelยฎ AI Reference Models. diff --git a/docs/tutorials/features/fast_bert.md b/docs/tutorials/features/fast_bert.md index 12725f6b6..a16862e6f 100644 --- a/docs/tutorials/features/fast_bert.md +++ b/docs/tutorials/features/fast_bert.md @@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc ### Prerequisite -- Transformers 4.6.0 ~ 4.31.0 +- Transformers 4.6.0 ~ 4.38.1 ### Usage Example diff --git a/docs/tutorials/features/sq_recipe_tuning_api.md b/docs/tutorials/features/sq_recipe_tuning_api.md index 6928a063c..0ef8e0918 100644 --- a/docs/tutorials/features/sq_recipe_tuning_api.md +++ b/docs/tutorials/features/sq_recipe_tuning_api.md @@ -15,6 +15,6 @@ SmoothQuant will introduce alpha to calculate the ratio of input and weight upda | shared_criterion | "mean" | ["min", "mean","max"] | criterion for input LayerNorm op of a transformer block. | | enable_blockwise_loss | False | [True, False] | whether to enable block-wise auto-tuning | -For LLM examples, please refer to [example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm). +For LLM examples, please refer to [example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm). **Note**: When defining dataloaders for calibration, please follow INC's dataloader [format](https://github.com/intel/neural-compressor/blob/master/docs/source/dataloader.md). diff --git a/docs/tutorials/installation.md b/docs/tutorials/installation.md index 1383ed7a8..b42f81066 100644 --- a/docs/tutorials/installation.md +++ b/docs/tutorials/installation.md @@ -1,8 +1,8 @@ Installation ============ -Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.2.0%2Bcpu). +Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.3.0%2Bcpu). After successful installation, refer to the [Quick Start](getting_started.md) and [Examples](examples.md) sections to start using the extension in your code. -**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm). +**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm). diff --git a/docs/tutorials/introduction.rst b/docs/tutorials/introduction.rst index 3edf7f1c4..0f0439dd1 100644 --- a/docs/tutorials/introduction.rst +++ b/docs/tutorials/introduction.rst @@ -16,7 +16,7 @@ the `Large Language Models (LLM) `_ section. 
Get Started ----------- -- `Installation <../../../index.html#installation?platform=cpu&version=v2.2.0%2Bcpu>`_ +- `Installation <../../../index.html#installation?platform=cpu&version=v2.3.0%2Bcpu>`_ - `Quick Start `_ - `Examples `_ diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index 72eb62c2a..e1e117e5d 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -30,8 +30,14 @@ Verified for distributed inference mode via DeepSpeed *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. -Please check `LLM best known practice <../../examples/cpu/inference/python/llm>`_ for instructions to install/setup environment and example scripts. +Please check `LLM best known practice `_ for instructions to install/setup environment and example scripts. +Module Level Optimization API for customized LLM (Prototype) +------------------------------------------------------------ + +In the past year, LLM has been flourishing with many open-sourced models contributed to the community, while researchers are building their own LLMs from transformer blocks with variants in implementation details. To help LLM researchers and developers improve their productivity, Intelยฎ Extension for PyTorch* provides module level optimizations for commonly used LLM modules and functionalities, which are operators or certain operator combinations in nature. + +Please check `LLM module level optimization practice `_ to better understand how to use `module level APIs `_ to optimize your LLM and achieve better performance. Demos ----- @@ -143,4 +149,4 @@ Operators fusion is generally used to enable sub-graph fusion to reduce the memo Distributed Inference ~~~~~~~~~~~~~~~~~~~~~ -All above optimizations already help you to get very good performance with single instance. To furthly reduce the inference latency and improve throughput, tensor parallel is also enabled in our soluction. You can firstly use DeepSpeed to auto shard the model and then apply above optimizations with the frontend API function provided by Intelยฎ Extension for PyTorch. +All above optimizations already help you to get very good performance with single instance. To further reduce the inference latency and improve throughput, tensor parallel is also enabled in our solution. You can firstly use DeepSpeed to auto shard the model and then apply above optimizations with the frontend API function provided by Intelยฎ Extension for PyTorch. diff --git a/docs/tutorials/llm/llm_optimize.md b/docs/tutorials/llm/llm_optimize.md index efc4278c2..44203cc80 100644 --- a/docs/tutorials/llm/llm_optimize.md +++ b/docs/tutorials/llm/llm_optimize.md @@ -9,7 +9,7 @@ API documentation is available at [API Docs page](https://intel.github.io/intel- ## Pseudocode of Common Usage Scenarios -The following sections show pseudocode snippets to invoke Intelยฎ Extension for PyTorch\* APIs to work with LLM models. Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm). +The following sections show pseudocode snippets to invoke Intelยฎ Extension for PyTorch\* APIs to work with LLM models. 
Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm). ### FP32/BF16 @@ -98,7 +98,7 @@ model = ipex.llm.optimize(model, quantization_config=qconfig, low_precision_chec Distributed inference can be performed with `DeepSpeed`. Based on original Intelยฎ Extension for PyTorch\* scripts, the following code changes are required. -Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm/distributed) for complete codes. +Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm/distributed) for complete codes. ``` python import torch diff --git a/docs/tutorials/releases.md b/docs/tutorials/releases.md index a7456b000..974774be5 100644 --- a/docs/tutorials/releases.md +++ b/docs/tutorials/releases.md @@ -1,6 +1,75 @@ Releases ============= +## 2.3.0 + +We are excited to announce the release of Intelยฎ Extension for PyTorch* 2.3.0+cpu, which accompanies PyTorch 2.3. This release mainly brings a new Large Language Model (LLM) feature, the module level LLM optimization API, which provides module level optimizations for commonly used LLM modules and functionalities and targets customized LLM modeling for scenarios like private models, self-customized models, and LLM serving frameworks. This release also further extends the list of optimized LLM models and includes a set of bug fixes and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and give us feedback so we can further improve the product. + +### Highlights + +- Large Language Model (LLM) optimization + + [Intelยฎ Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) provides a new feature called module level LLM optimization API, which provides module level optimizations for commonly used LLM modules and functionalities. LLM creators can then use this new API set to replace the related parts in their own models and reach peak performance. 
+ + There are 3 categories of module level LLM optimization APIs in general: + + - Linear post-op APIs + + ```python + # using module init and forward + ipex.llm.modules.linearMul + ipex.llm.modules.linearGelu + ipex.llm.modules.linearNewGelu + ipex.llm.modules.linearAdd + ipex.llm.modules.linearAddAdd + ipex.llm.modules.linearSilu + ipex.llm.modules.linearSiluMul + ipex.llm.modules.linear2SiluMul + ipex.llm.modules.linearRelu + ``` + + - Attention related APIs + + ```python + # using module init and forward + ipex.llm.modules.RotaryEmbedding + ipex.llm.modules.RMSNorm + ipex.llm.modules.FastLayerNorm + ipex.llm.modules.VarlenAttention + ipex.llm.modules.PagedAttention + ipex.llm.modules.IndirectAccessKVCacheAttention + + # using as functions + ipex.llm.functional.rotary_embedding + ipex.llm.functional.rms_norm + ipex.llm.functional.fast_layer_norm + ipex.llm.functional.indirect_access_kv_cache_attention + ipex.llm.functional.varlen_attention + ``` + + - Generation related APIs + + ```python + # using for optimizing huggingface generation APIs with prompt sharing + ipex.llm.generation.hf_beam_sample + ipex.llm.generation.hf_beam_search + ipex.llm.generation.hf_greedy_search + ipex.llm.generation.hf_sample + ``` + + More detailed introduction on how to apply this API set and example code walking you through can be found [here](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.3/examples/cpu/inference/python/llm-modeling). + +- Bug fixing and other optimization + + - Optimized the performance of LLM [#2561](https://github.com/intel/intel-extension-for-pytorch/commit/ade45387ecc4e707754de9db6fc2be0af186e2ba) [#2584](https://github.com/intel/intel-extension-for-pytorch/commit/05d07645e1ae5eeeff15abda31a6ba5806dd2bb2) [#2617](https://github.com/intel/intel-extension-for-pytorch/commit/adb563834a4f6bd327d7307c493c8fe1648e6211) [#2663](https://github.com/intel/intel-extension-for-pytorch/commit/214dea0c8e7b2864a0c2d1a1c32fb7815ca68070) [#2733](https://github.com/intel/intel-extension-for-pytorch/commit/f5b941c3b7ea8fe1a387617a9329467d1e1b544a) + - Supported Act Order of GPTQ [#2550](https://github.com/intel/intel-extension-for-pytorch/commit/be636289eef628b995e79a475c58f8a4d93e4890) [#2568](https://github.com/intel/intel-extension-for-pytorch/commit/9fcc4897492333330fb6bd156b1178d55347d292) + - Improved the warning and the logging information for better user experience [#2641](https://github.com/intel/intel-extension-for-pytorch/commit/e0bf673cf3ea4063a7e168ec221f421fbd378fb3) [#2675](https://github.com/intel/intel-extension-for-pytorch/commit/770275a755ea0445675720a3f6f14e77c491fceb) + - Added TorchServe CPU Example [#2613](https://github.com/intel/intel-extension-for-pytorch/commit/1f6fe6423dde7ccecc1565e73dc81d9cb281bc1f) + - Upgraded oneDNN to v3.4.1 [#2747](https://github.com/intel/intel-extension-for-pytorch/commit/e2a9af49874fcf39097036c08848cd37cadc0084) + - Misc fix and enhancement [#2468](https://github.com/intel/intel-extension-for-pytorch/commit/f88a7d127a6a3017db508454c7d332d7b2ad83f6) [#2627](https://github.com/intel/intel-extension-for-pytorch/commit/bc32ea463084d711e4a9aae85e38dd5d7d427849) [#2631](https://github.com/intel/intel-extension-for-pytorch/commit/f55a2bfa5d505fb7c7a6225c1c6206b5926777ab) [#2704](https://github.com/intel/intel-extension-for-pytorch/commit/eae477f76356b5a83640941787a168f680334775) + +**Full Changelog**: https://github.com/intel/intel-extension-for-pytorch/compare/v2.2.0+cpu...v2.3.0+cpu + ## 2.2.0 We are excited to announce the release 
of Intelยฎ Extension for PyTorch\* 2.2.0+cpu which accompanies PyTorch 2.2. This release mainly brings in our latest optimization on Large Language Model (LLM) including new dedicated API set (`ipex.llm`), new capability for auto-tuning accuracy recipe for LLM, and a broader list of optimized LLM models, together with a set of bug fixing and small optimization. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product. diff --git a/examples/cpu/inference/cpp/README.md b/examples/cpu/inference/cpp/README.md index 49afbdd80..e4d30b2b1 100644 --- a/examples/cpu/inference/cpp/README.md +++ b/examples/cpu/inference/cpp/README.md @@ -16,15 +16,15 @@ We can have `libtorch` and `libintel-ext-pt` installed via the following command Download zip file of `libtorch` and decompress it: ```bash -wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.2.0%2Bcpu.zip -unzip libtorch-cxx11-abi-shared-with-deps-2.2.0+cpu.zip +wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.3.0%2Bcpu.zip +unzip libtorch-cxx11-abi-shared-with-deps-2.3.0+cpu.zip ``` Download and execute `libintel-ext-pt` installation script: ```bash -wget https://intel-extension-for-pytorch.s3.amazonaws.com/libipex/cpu/libintel-ext-pt-cxx11-abi-2.2.0%2Bcpu.run -bash libintel-ext-pt-cxx11-abi-2.2.0+cpu.run install ./libtorch +wget https://intel-extension-for-pytorch.s3.amazonaws.com/libipex/cpu/libintel-ext-pt-cxx11-abi-2.3.0%2Bcpu.run +bash libintel-ext-pt-cxx11-abi-2.3.0+cpu.run install ./libtorch ``` *Note:* If your C++ project has pre-C\+\+11 library dependencies, diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md index 694b7b605..196843409 100644 --- a/examples/cpu/inference/python/llm-modeling/README.md +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -1,15 +1,16 @@ -# 1. LLM Optimization Overview +๏ปฟ# 1. LLM Module Level Optimizations Overview -ipex.llm provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. -To further provide optimized modules or functions to help build modelings, ipex supports the following module/function level APIs: +Intelยฎ Extension for PyTorch* provides dedicated optimization for running Large Language Models (LLMs) faster, including technical points like paged attention, ROPE fusion, etc. 
+To further provide optimized modules or functions to help build modelings, `ipex.llm` supports the following module/function level APIs: -``` +```python import intel_extension_for_pytorch as ipex ``` -### linear post-op fusions -``` -#using module init and forward +## Linear post-op fusions + +```python +# using module init and forward ipex.llm.modules.linearMul ipex.llm.modules.linearGelu ipex.llm.modules.linearNewGelu @@ -21,9 +22,10 @@ ipex.llm.modules.linear2SiluMul ipex.llm.modules.linearRelu ``` -### Attention related fusions -``` -#using module init and forward +## Attention related fusions + +```python +# using module init and forward ipex.llm.modules.RotaryEmbedding ipex.llm.modules.RMSNorm ipex.llm.modules.FastLayerNorm @@ -31,7 +33,7 @@ ipex.llm.modules.VarlenAttention ipex.llm.modules.PagedAttention ipex.llm.modules.IndirectAccessKVCacheAttention -#using as functions +# using as functions ipex.llm.functional.rotary_embedding ipex.llm.functional.rms_norm ipex.llm.functional.fast_layer_norm @@ -43,8 +45,11 @@ ipex.llm.functional.silu_mul ipex.llm.functional.gelu_mul ``` -### Generation related fusions -``` +## Generation related fusions + +```python +# using for optimizing huggingface generation APIs with prompt sharing +ipex.llm.generation.hf_beam_sample ipex.llm.generation.hf_beam_search ipex.llm.generation.hf_greedy_search ipex.llm.generation.hf_sample @@ -52,39 +57,21 @@ ipex.llm.generation.hf_sample
-# 2. Show cases of ipex.llm optimized modules and functions based modeling -We provide LLAMA, GPTJ and OPT modeling as show cases that apply the optimized modules or functions from ipex.llm layers. - -| MODEL FAMILY | MODEL NAME (Huggingface hub) | -|:---:|:---:| -|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", etc. | -|GPT-J| "EleutherAI/gpt-j-6b", etc. | -|OPT| "facebook/opt-30b", "facebook/opt-1.3b", etc. | - -## How To Run LLM with ipex.llm +# 2. Showcases of ipex.llm optimized modules and functions based modeling -**ipex.llm provides a single script to facilitate running generation tasks as below:** -Note that please setup ENV according to the ../llm/README.md +We provide optimized LLAMA, GPT-J and OPT modeling files on the basis of [huggingface modeling APIs](https://huggingface.co/docs/transformers/en/main_classes/model) and a entry script `run.py` as showcases that apply the optimized modules or functions from `ipex.llm`. -``` -python run.py --help # for more detailed usages -``` +## Running example script -| Key args of run.py | Notes | -|---|---| -| model name | use "-m MODEL_NAME" to choose models to run | -| generation | default: beam search (beam size = 4), "--greedy" for greedy search | -| input tokens | default: 32, provide fixed sizes for input prompt size, use "--input-tokens" for [32, 64, 128, 256, 512, 1024, 2016, 2017, 2048, 4096, 8192]; if "--input-tokens" is not used, use "--prompt" to choose other strings as inputs| -| output tokens | default: 32, use "--max-new-tokens" to choose any other size | -| batch size | default: 1, use "--batch-size" to choose any other size | -| generation iterations | use "--num-iter" and "--num-warmup" to control the repeated iterations of generation, default: 100-iter/10-warmup | -| ipex prepack | apply ipex weight prepack optimization by "--use-ipex-optimize"| -| profiling | enable pytorch profiling by " --profile"| +Please refer to the [instructions](../llm/README.md#3-environment-setup) for environment setup. -*Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). +The detail usage of `run.py` can be obtained by running +```bash +python run.py --help +``` -## Run commands +Example commands are listed below: ```bash # The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that @@ -92,9 +79,118 @@ python run.py --help # for more detailed usages # Please adjust the settings per your hardware. # Running FP32 model -OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype float32 --use-ipex-optimize +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype float32 # Running BF16 model -OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --use-ipex-optimize +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype bfloat16 +``` + +*Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). + +
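Before customizing your own model in the next section, a minimal sketch of how one of these fusion modules is used directly may help. It follows the `LinearSilu` docstring example in `intel_extension_for_pytorch/llm/modules/linear_fusion.py` and is illustrative only.

```python
# Sketch of direct module-level usage, following the LinearSilu docstring
# example in intel_extension_for_pytorch/llm/modules/linear_fusion.py.
import torch
import intel_extension_for_pytorch as ipex

# module init: wrap an existing torch.nn.Linear with the fused linear + SiLU variant
linear_module = torch.nn.Linear(4096, 4096)
ipex_fusion = ipex.llm.modules.LinearSilu(linear_module)

# module forward: computes torch.nn.functional.silu(linear_module(input))
input = torch.randn(4096, 4096)
result = ipex_fusion(input)
```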
+ +# 3. Optimize your own LLM with ipex.llm + +## Changes required in the modeling file + +The changes required for applying `ipex.llm` optimizations for the customized LLMs are highly diverse based on their respective model architectures and implementations. +Generally speaking, the key steps would be: + +1. Analyze the model to find out the parts that are suitable for utilizing the optimizations. +2. Re-write these parts, applying the optimized `ipex.llm` operators. + +3. Some refactor of model architecture definition may be required to connect the original and optimized modules. + +## Changes required in the inference entry script + +Some key updates are required in the LLM inference entry script: + +1. Optimization for linear modules and their fusions: realized by weight prepacking with `ipex.optimize()`. + +```python +from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( + _enable_tpp, + _disable_tpp, +) + +_disable_tpp() +if args.dtype == "bfloat16": + _enable_tpp() + model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) +else: + model = ipex.optimize( + model.eval(), + dtype=torch.float32, + inplace=True, + auto_kernel_selection=True, + ) +``` + +*Note:* The example is for FP32/BF16 optimization. +Please refer to [Advanced Usage](#4-advanced-usage) part for weight only quantization enabling. + +2. Optimizations for [the huggingface text generation API](https://huggingface.co/docs/transformers/en/main_classes/text_generation): + +- Using `ipex.llm.generation` functions to get prompt sharing for first token acceleration when `num_beams > 1`. + +```python +# Taking beam search as example here, please check complete code updates in run.py +hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) +setattr(model, "beam_search", hf_beam_search) ``` + +- Using PyTorch jit to further reduce dispatch overhead for first token and next tokens acceleration. + +```python +# Please create a dummy `sample_inputs` in advance +# as the example input for jit.trace() +with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + trace_model = torch.jit.trace( + model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + trace_model = torch.jit.freeze(trace_model) + model = ipex._set_optimized_model_for_generation( + model, optimized_model=trace_model + ) +``` + +Please read `run.py` and the example modeling files for detail of the changes. +The key parts are highlighted with comments. + +
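As a concrete illustration of the modeling-file changes described above, the sketch below rewrites a hypothetical llama-style RMSNorm module to call the fused `ipex.llm.functional.rms_norm(hidden_states, weight, eps)` operator documented in `intel_extension_for_pytorch/llm/functional/fusions.py`. The class name `MyRMSNorm` and the commented-out reference implementation are stand-ins, not code from this repository.

```python
# Illustrative sketch only: a hypothetical custom RMSNorm rewritten to use
# the fused ipex.llm functional operator.
import torch
import intel_extension_for_pytorch as ipex


class MyRMSNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Reference (eager) implementation that would typically be here:
        #   variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        #   hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        #   return self.weight * hidden_states.to(self.weight.dtype)
        # Optimized path: call the fused ipex.llm operator instead.
        return ipex.llm.functional.rms_norm(
            hidden_states, self.weight, self.variance_epsilon
        )
```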
+ +# 4. Advanced usage + +## How to apply weight-only quantization (INT8) + +Intelยฎ Extension for PyTorch* also provides weight-only quantization for INT8 precision optimization +(replacing the `ipex.optimize()` part, which targets FP32/BF16 optimization in the showcases above). + +```python +from intel_extension_for_pytorch.quantization import WoqWeightDtype +from intel_extension_for_pytorch.quantization import prepare, convert +weight_dtype = WoqWeightDtype.INT8 # weight dtype is int8 +lowp_mode = ipex.quantization.WoqLowpMode.BF16 # lowest precision for computation +qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + group_size=-1, # default is -1; can be further tuned in [32, 64, 128, 256, 512] (recommended) for better accuracy if needed +) +prepared_model = prepare(model, qconfig) +with torch.no_grad(), torch.cpu.amp.autocast(enabled=True): # quantization with AMP is recommended for better performance + converted_model = convert(prepared_model).to(torch.bfloat16) +``` + +
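After quantization, the converted model could be fed into the same jit trace/freeze flow shown in the inference entry script section above. The sketch below carries over the `sample_inputs` dummy inputs and the autocast usage from that FP32/BF16 example as assumptions; it is not a prescribed weight-only quantization recipe.

```python
# Sketch: reuse the jit trace / freeze flow from the FP32/BF16 example above
# with the weight-only quantized model. `sample_inputs` is assumed to be the
# dummy example input built for torch.jit.trace().
with torch.no_grad(), torch.cpu.amp.autocast(enabled=True):
    trace_model = torch.jit.trace(
        converted_model,
        example_kwarg_inputs=sample_inputs,
        strict=False,
        check_trace=False,
    )
    trace_model = torch.jit.freeze(trace_model)
    model = ipex._set_optimized_model_for_generation(
        model, optimized_model=trace_model
    )
```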
+ +# 5. Miscellaneous Tips + +- For LLMs, usually the query, key and value linear operations in Attention layer can be fused into one linear as kind of concat linear optimization. (e.g., [modeling_gpt_neox](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#L175) from transformers) + +- LLM generation tasks are based on the [assumption](https://huggingface.co/blog/how-to-generate) that the probability distribution of a word sequence can be decomposed into the product of conditional next word distributions. +Thus the model's computation of `lm_head` layer during the first token's generation can be reduced with using last token as its inputs (instead of using the full tokens from input prompt). +The showcases we provide contain such optimization (set with `lm_head_generation` flag). This is also optimized in LLM serving [text-generation-inference](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py#L419). \ No newline at end of file diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py index 6a34ca845..d8f73ee60 100644 --- a/examples/cpu/inference/python/llm-modeling/run.py +++ b/examples/cpu/inference/python/llm-modeling/run.py @@ -117,7 +117,6 @@ def get_dummy_input(_model, return_dict=False): ) parser.add_argument("--greedy", action="store_true") parser.add_argument("--profile", action="store_true") -parser.add_argument("--use-ipex-optimize", action="store_true") parser.add_argument("--token-latency", action="store_true") parser.add_argument("--num-iter", default=100, type=int, help="num iter") parser.add_argument("--num-warmup", default=10, type=int, help="num warmup") @@ -138,7 +137,6 @@ def get_dummy_input(_model, return_dict=False): torch_dtype=amp_dtype, low_cpu_mem_usage=True, attn_implementation="eager", - # torchscript=True if args.use_ipex_optimize else False, ) tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) @@ -153,56 +151,55 @@ def get_dummy_input(_model, return_dict=False): model = model.eval() -if args.use_ipex_optimize: - if not hasattr(model.config, "use_ipex_optimize"): - model.config.use_ipex_optimize = True - # 1) using ipex weight prepack to work with IPEX linear module and their fusions - from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( - _enable_tpp, - _disable_tpp, - ) +# Adding this attribute in model.config +# as it will be used in the modeling file. +if not hasattr(model.config, "use_ipex_optimize"): + model.config.use_ipex_optimize = True +# 1) Applying IPEX weight prepacking with `ipex.optimize()` +# to accelerate linear modules and their fusions. 
+from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( + _enable_tpp, + _disable_tpp, +) - _disable_tpp() - if args.dtype == "bfloat16": - _enable_tpp() - model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) - else: - model = ipex.optimize( - model.eval(), - dtype=torch.float32, - inplace=True, - auto_kernel_selection=True, - ) +_disable_tpp() +if args.dtype == "bfloat16": + _enable_tpp() + model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) +else: + model = ipex.optimize( + model.eval(), + dtype=torch.float32, + inplace=True, + auto_kernel_selection=True, + ) - # 2) using ipex geneartion function to get prompt sharing and first token optimizations - hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) - hf_greedy_search = ipex.llm.generation.hf_greedy_search.__get__( - model, model.__class__ +# 2) using `ipex.llm.generation` functions +# to get prompt sharing for first token optimization +hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) +hf_greedy_search = ipex.llm.generation.hf_greedy_search.__get__(model, model.__class__) +hf_sample = ipex.llm.generation.hf_sample.__get__(model, model.__class__) +hf_beam_sample = ipex.llm.generation.hf_beam_sample.__get__(model, model.__class__) + +setattr(model, "beam_search", hf_beam_search) # noqa: B010 +setattr(model, "greedy_search", hf_greedy_search) # noqa: B010 +setattr(model, "sample", hf_sample) # noqa: B010 +setattr(model, "beam_sample", hf_beam_sample) # noqa: B010 + +if not hasattr(model.config, "lm_head_generation"): + model.config.lm_head_generation = True + +# 3) using PyTorch jit to further reduce dispatch overhead +sample_inputs = get_dummy_input(model, return_dict=True) +with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + trace_model = torch.jit.trace( + model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, ) - hf_sample = ipex.llm.generation.hf_sample.__get__(model, model.__class__) - hf_beam_sample = ipex.llm.generation.hf_beam_sample.__get__(model, model.__class__) - - setattr(model, "beam_search", hf_beam_search) # noqa: B010 - setattr(model, "greedy_search", hf_greedy_search) # noqa: B010 - setattr(model, "sample", hf_sample) # noqa: B010 - setattr(model, "beam_sample", hf_beam_sample) # noqa: B010 - - if not hasattr(model.config, "lm_head_generation"): - model.config.lm_head_generation = True - - # 3) using PyTorch jit to further reduce dispatch overhead - sample_inputs = get_dummy_input(model, return_dict=True) - with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): - trace_model = torch.jit.trace( - model, - example_kwarg_inputs=sample_inputs, - strict=False, - check_trace=False, - ) - trace_model = torch.jit.freeze(trace_model) - model = ipex._set_optimized_model_for_generation( - model, optimized_model=trace_model - ) + trace_model = torch.jit.freeze(trace_model) + model = ipex._set_optimized_model_for_generation(model, optimized_model=trace_model) if ( diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index c6ac2beb7..7123f243e 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -1,6 +1,7 @@ # 1. LLM Optimization Overview -ipex.llm provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. 
And a set of data types are supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, Weight Only Quantization INT8/INT4 (prototype). +`ipex.llm` provides dedicated optimization for running Large Language Models (LLMs) faster, including technical points like paged attention, ROPE fusion, etc. +A set of data types is supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, and Weight Only Quantization INT8/INT4 (prototype).
@@ -10,29 +11,36 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) | MODEL FAMILY | MODEL NAME (Huggingface hub) | FP32 | BF16 | Static quantization INT8 | Weight only quantization INT8 | Weight only quantization INT4 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:| -|LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | -|GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|DOLLY| databricks/dolly-v2-12b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | -|OPT| facebook/opt-30b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-8B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLAMA| meta-llama/Meta-Llama-3-70B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|DOLLY| databricks/dolly-v2-12b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|FALCON| tiiuae/falcon-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|OPT| facebook/opt-30b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | |Bloom| bigscience/bloom-1b7 | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | -|Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | -|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |Baichuan| baichuan-inc/Baichuan-13B-Chat | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | | -|ChatGLM| THUDM/chatglm3-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | -|ChatGLM| THUDM/chatglm2-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | -|GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | -|Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | -|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | -|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | -|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|ChatGLM| THUDM/chatglm3-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|ChatGLM| THUDM/chatglm2-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | ๐ŸŸจ | +|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLaVA| liuhaotian/llava-v1.5-7b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|GIT| microsoft/git-base | 
๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|Yuan| IEITYuan/Yuan2-102B-hf | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | +|Phi| microsoft/phi-2 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ## 2.2 Verified for distributed inference mode via DeepSpeed @@ -41,6 +49,8 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) |LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | |LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | |LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-8B | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-70B | ๐ŸŸฉ | ๐ŸŸฉ | |GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸจ | ๐ŸŸฉ | |GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸจ | ๐ŸŸฉ | |DOLLY| databricks/dolly-v2-12b | ๐ŸŸจ | ๐ŸŸฉ | @@ -50,12 +60,15 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) |Bloom| bigscience/bloom-1b7 | ๐ŸŸจ | ๐ŸŸฉ | |CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | |Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | -|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸจ | ๐ŸŸฉ | +|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | |Baichuan| baichuan-inc/Baichuan-13B-Chat | ๐ŸŸจ | ๐ŸŸฉ | |GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | |T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | |Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | |MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | +|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | +|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | +|GIT| microsoft/git-base | ๐ŸŸฉ | ๐ŸŸฉ | - ๐ŸŸฉ signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32). @@ -69,8 +82,7 @@ We are working in progress to better support the models in the tables with vario # 3. Environment Setup *Note*: The instructions in this section will setup an environment with a recent PyTorch\* nightly build and **a latest source build of IPEX**. -If you would like to use stable PyTorch\* and IPEX release versions, please refer to the instructions [in the release branch](https://github.com/intel/intel-extension-for-pytorch/blob/v2.2.0%2Bcpu/examples/cpu/inference/python/llm/README.md#3-environment-setup), in which IPEX is installed via prebuilt wheels using `pip install` rather than source code building. - +If you would like to use stable PyTorch\* and IPEX release versions, please refer to the instructions [in the release branch](https://github.com/intel/intel-extension-for-pytorch/blob/v2.3.0%2Bcpu/examples/cpu/inference/python/llm/README.md#3-environment-setup), in which IPEX is installed via prebuilt wheels using `pip install` rather than source code building. ## 3.1 [Recommended] Docker-based environment setup with compilation from source @@ -119,6 +131,21 @@ source ./tools/env_activate.sh
+*Note*: In the `env_setup.sh` script, a `prompt.json` file is downloaded, which provides prompt samples with pre-defined input token lengths for benchmarking. +For benchmarking **Llama-3 models**, users need to download a dedicated `prompt.json` file and overwrite the original one. + +```bash +wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json +``` + +The original `prompt.json` file can be restored from the repository if needed. + +```bash +wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json +``` + +
+ # 4. How To Run LLM with ipex.llm **ipex.llm provides a single script to facilitate running generation tasks as below:** @@ -127,7 +154,6 @@ source ./tools/env_activate.sh python run.py --help # for more detailed usages ``` - | Key args of run.py | Notes | |---|---| | generation | default: beam search (beam size = 4), "--greedy" for greedy search | @@ -229,16 +255,15 @@ cd distributed unset KMP_AFFINITY # Distributed inference in FP32 -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai # Distributed inference in BF16 -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai # Distributed inference with Weight-Only Quantization -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai ``` - ## 4.2 Detail usage of running LLM models ### 4.2.1 Run generation with one instance @@ -622,28 +647,28 @@ unset KMP_AFFINITY - Command: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype float32 --ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype float32 --ipex --tasks ``` - An example of llama2 7b model: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai ``` #### 5.2.2.3 BF16: - Command: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype bfloat16 -ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype bfloat16 -ipex --tasks ``` - An example of llama2 7b model: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank 
run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai ``` #### 5.2.2.4 Weight-only quantization (INT8): - Command: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks ``` Similar to script usage for performance benchmarking, we need to update some arguments of the running command specifically for some models to achieve better accuracy. @@ -661,7 +686,7 @@ Similar to script usage for performance benchmarking, we need to update some arg - An example of llama2 7b model: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks ``` ## 5.3 How to Shard model for Distributed tests with DeepSpeed (autoTP) diff --git a/examples/cpu/inference/python/llm/llm_sq_recipes.md b/examples/cpu/inference/python/llm/llm_sq_recipes.md index 009938e07..22df336e2 100644 --- a/examples/cpu/inference/python/llm/llm_sq_recipes.md +++ b/examples/cpu/inference/python/llm/llm_sq_recipes.md @@ -12,9 +12,10 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama ## Example command for model tuning with AutoTune API | Model ID | Command | |---|:---:| -| meta-llama/Llama-2-7b-hf | python run.py -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --batch-size 56 --calib-len 2048 --fallback-add --alpha auto --init-alpha 0.8 --alpha-min 0.8 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'mean' | +| meta-llama/Llama-2-13b-hf | python run.py -m meta-llama/Llama-2-13b-hf --ipex-smooth-quant --alpha auto --init-alpha 0.8 --alpha-min 0.75 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'max' --calib-len 1024 --calib-padding --fallback-add | | meta-llama/Llama-2-70b-hf | python run.py -m meta-llama/Llama-2-70b-hf --ipex-smooth-quant --batch-size 56 --calib-shuffle --fallback-add --alpha 0.8 | | EleutherAI/gpt-j-6b | python run.py -m EleutherAI/gpt-j-6b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --fallback-add --alpha 0.85 | +| tiiuae/falcon-7b | python run.py -m tiiuae/falcon-7b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | | tiiuae/falcon-40b | python run.py -m tiiuae/falcon-40b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.9 | | facebook/opt-30b | python run.py -m facebook/opt-30b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle | | facebook/opt-1.3b | python run.py -m facebook/opt-1.3b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.85 | diff --git a/examples/cpu/serving/torchserve/README.md b/examples/cpu/serving/torchserve/README.md index e18f302fd..8a9b30464 100644 --- a/examples/cpu/serving/torchserve/README.md +++ b/examples/cpu/serving/torchserve/README.md 
@@ -14,7 +14,7 @@ docker run \ --rm -it -u root \ --entrypoint='' \ -v $PWD:/home/model-server \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu \ + intel/intel-optimized-pytorch:2.3.0-serving-cpu \ python quantize_model.py ``` @@ -31,7 +31,7 @@ docker run \ --rm -it -u root \ --entrypoint='' \ -v $PWD:/home/model-server \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu \ + intel/intel-optimized-pytorch:2.3.0-serving-cpu \ torch-model-archiver \ --model-name ipex-resnet50 \ --version 1.0 \ @@ -43,7 +43,7 @@ docker run \ > [!NOTE] > If you are working under a corporate proxy you will need to include the following parameters in your `docker run` command: `-e http_proxy=${http_proxy} -e https_proxy=${https_proxy}`. -#### Advanced Model Archival +### Advanced Model Archival The `--handler` argument is an important component of serving as it controls the inference pipeline. Torchserve provides several default handlers [built into the application](https://pytorch.org/serve/default_handlers.html#torchserve-default-inference-handlers). that are often enough for most inference cases, but you may need to create a custom handler if your application's inference needs additional preprocessing, postprocessing or using other variables to derive a final output. To create a custom handler, first inherit `BaseHandler` or another built-in handler and override any necessary functionality. Usually, you only need to override the preprocessing and postprocessing methods to achieve an application's inference needs. @@ -88,7 +88,7 @@ docker run \ -v $PWD/model-store:/home/model-server/model-store \ -v $PWD/wf-store:/home/model-server/wf-store \ --net=host \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu + intel/intel-optimized-pytorch:2.3.0-serving-cpu ``` > [!TIP] diff --git a/examples/cpu/serving/triton/Dockerfile b/examples/cpu/serving/triton/Dockerfile index 73d9fc66d..8897773f5 100644 --- a/examples/cpu/serving/triton/Dockerfile +++ b/examples/cpu/serving/triton/Dockerfile @@ -15,11 +15,11 @@ ARG TORCHAUDIO_VERSION ARG IPEX_VERSION RUN python3 -m pip install --no-cache-dir \ - torch==${PYTORCH_VERSION:-2.1.0+cpu} \ - torchaudio==${TORCHAUDIO_VERSION:-2.1.0+cpu} \ - torchvision==${TORCHVISION_VERSION:-0.16.0+cpu} \ + torch==${PYTORCH_VERSION:-2.3.0+cpu} \ + torchaudio==${TORCHAUDIO_VERSION:-2.3.0+cpu} \ + torchvision==${TORCHVISION_VERSION:-0.18.0+cpu} \ -f https://download.pytorch.org/whl/cpu/torch_stable.html \ - intel_extension_for_pytorch==${IPEX_VERSION:-2.1.0}+cpu \ + intel_extension_for_pytorch==${IPEX_VERSION:-2.3.0}+cpu \ -f https://developer.intel.com/ipex-whl-stable-cpu \ configargparse \ intel-openmp \ diff --git a/examples/cpu/serving/triton/requirements.txt b/examples/cpu/serving/triton/requirements.txt index 3dcefb4b2..dc40f4345 100644 --- a/examples/cpu/serving/triton/requirements.txt +++ b/examples/cpu/serving/triton/requirements.txt @@ -1,7 +1,7 @@ -torch==2.2.0 --index-url https://download.pytorch.org/whl/cpu -torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cpu -torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cpu -intel_extension_for_pytorch==2.2.0 +torch==2.3.0 --index-url https://download.pytorch.org/whl/cpu +torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cpu +torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu +intel_extension_for_pytorch==2.3.0 transformers==4.38.1 tritonclient[all]==2.41.1 intel-openmp==2024.0.2 diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py 
b/intel_extension_for_pytorch/llm/functional/fusions.py index 04f05dc49..12aea9a7f 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -22,25 +22,31 @@ def rotary_embedding( ): r""" Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) - on the `query ` or `key` before their multi-head attention computation. + on the `query ` or `key` before their multi-head attention computation. + Args: - - query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of - [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). - - sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. - - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. - - head_dim (int) : head dim from the input shape. - - rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, - so the offset is 1. - if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, - so the offset is rotary_dim/2. - - position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids - for the input. The shape should be [batch size, sequence length]. + query, key (torch.Tensor) : inputs to be applied with position embeddings, + taking shape of [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). + sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor + generated to be applied on query/key. + rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + head_dim (int) : head dim from the input shape. + rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. + + if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + + position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. + The according position_ids for the input. The shape should be [batch size, sequence length]. + Return - - query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim]. + query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim]. """ + return RotaryEmbedding.apply_function( query, key, sin, cos, rotary_dim, rotary_half, position_ids ) @@ -50,12 +56,14 @@ def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float): r""" Applies RMSnorm on the input (hidden states). (see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L76) + Args: - - hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. - - weight (torch.Tensor): the weight to apply RMSnorm. - - eps (float) : the variance_epsilon to apply RMSnorm. + hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. + weight (torch.Tensor): the weight to apply RMSnorm. + eps (float) : the variance_epsilon to apply RMSnorm. 
""" + return RMSNorm.apply_function(hidden_states, weight, eps) @@ -69,12 +77,14 @@ def fast_layer_norm( r""" Applies PyTorch Layernorm (see https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) on the input (hidden states). + Args: - - hidden_states(torch.Tensor) : the input tensor to apply normalization. - - normalized_shape (int or list) or torch.Size) input shape from an expected input of size. - - weight (torch.Tensor): the weight to apply normalization. - - bias (torch.Tensor): an additive bias for normalization. - - eps (float): a value added to the denominator for numerical stability. + hidden_states(torch.Tensor) : the input tensor to apply normalization. + normalized_shape (int or list) or torch.Size) input shape from an + expected input of size. + weight (torch.Tensor): the weight to apply normalization. + bias (torch.Tensor): an additive bias for normalization. + eps (float): a value added to the denominator for numerical stability. """ @@ -105,33 +115,49 @@ def indirect_access_kv_cache_attention( buffers(key and value use different buffers) to store all key/value hidden states and beam index information. It can use beam index history to decide which beam should be used by a timestamp and this information will generate an offset to access the kv_cache buffer. + Data Format: - - The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], - the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. - All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. - - forward - - query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). - - layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - beam-idx: history beam idx, shape:(max_seq, beam*batch); - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). - - head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. - - attention_mask(torch.Tensor): Attention mask information. - - text_max_length (int) : the max length of kv cache to be used for generation (allocate the pre-cache buffer). + + The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], + the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. + All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. + + Args: + query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). + key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). + value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). + scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). + layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). 
+ + - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - beam-idx: history beam idx, shape:(max_seq, beam*batch); + + - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). + + head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. + attention_mask(torch.Tensor): Attention mask information. + text_max_length (int) : the max length of kv cache to be used for generation + (allocate the pre-cache buffer). Return: - - attn_output: weighted value which is the output of scale dot product. shape (beam*batch, seq_len, head_num, head_size). - - attn_weights: The output tensor of the first matmul in scale dot product which is not supported by kernel now. - - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). + attn_output: weighted value which is the output of scale dot product. + shape (beam*batch, seq_len, head_num, head_size). + + attn_weights: the output tensor of the first matmul in scale dot product + which is not supported by kernel now. + + new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). Notes: - - How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model - see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model + see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + + .. highlight:: python + .. code-block:: python + def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: @@ -143,6 +169,7 @@ def _reorder_cache( return past_key_values """ + return IndirectAccessKVCacheAttention.apply_function( query, key, @@ -176,25 +203,33 @@ def varlen_attention( ): r""" Applies PyTorch scaled_dot_product_attention on the inputs of query, key and value - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), - and accept the variant (different) sequence length among the query, key and value. + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), + and accept the variant (different) sequence length among the query, key and value. + + This module does not have args for `module init`. + + `forward()` Args: - module init: this module does not have args for module init - forward: - - query (torch.Tensor): shape [query_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - key (torch.Tensor): shape [key_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - value (torch.Tensor): shape [value_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - out (torch.Tensor): buffer to get the results, the shape is the same as query. - - seqlen_q (torch.Tensor): shape [batch_size + 1], points the current query_tokens among total sequence length. - - seqlen_k (torch.Tensor): shape [batch_size + 1], points the current key_tokens among total sequence length. - - max_seqlen_q (int): max/total sequence length of query. - - max_seqlen_k (int): max/total sequence length of key. - - pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. 
- - softmax_scale (float): scaling factor applied is prior to softmax. - - is_causal (bool): whether to apply causal attention masking, default is True. + query (torch.Tensor): shape [query_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + key (torch.Tensor): shape [key_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + value (torch.Tensor): shape [value_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + out (torch.Tensor): buffer to get the results, the shape is the same as query. + seqlen_q (torch.Tensor): shape [batch_size + 1], + points the current query_tokens among total sequence length. + seqlen_k (torch.Tensor): shape [batch_size + 1], + points the current key_tokens among total sequence length. + max_seqlen_q (int): max/total sequence length of query. + max_seqlen_k (int): max/total sequence length of key. + pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. + softmax_scale (float): scaling factor applied is prior to softmax. + is_causal (bool): whether to apply causal attention masking, default is True. """ + return VarlenAttention.apply_function( query, key, diff --git a/intel_extension_for_pytorch/llm/modules/linear_fusion.py b/intel_extension_for_pytorch/llm/modules/linear_fusion.py index 380cf8de4..26e4c99d3 100644 --- a/intel_extension_for_pytorch/llm/modules/linear_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/linear_fusion.py @@ -53,12 +53,21 @@ def init_on_device(self, x, op_type): class LinearSilu(IPEXLinearFusion): r""" Applies a linear transformation to the `input` data, and then apply PyTorch SILU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) on the result: + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) + on the result: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.silu(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with silu. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with silu. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -66,6 +75,7 @@ class LinearSilu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -80,15 +90,25 @@ def forward(self, x): class Linear2SiluMul(IPEXLinear2Fusion): r""" - Applies two linear transformation to the `input` data (`linear_s` and `linear_m`), then apply PyTorch SILU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) on the result from `linear_s` - , and multiplies the result from `linear_m`: + Applies two linear transformation to the `input` data (`linear_s` and + `linear_m`), then apply PyTorch SILU + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) + on the result from `linear_s`, and multiplies the result from `linear_m`: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.silu(linear_s(input)) * linear_m(input) + Args: - linear_s (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with silu. - linear_m (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with mul. 
+ linear_s (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with silu. + linear_m (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with mul. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_s_module = torch.nn.Linear(4096, 4096) @@ -97,6 +117,7 @@ class Linear2SiluMul(IPEXLinear2Fusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear_s, linear_m): @@ -112,12 +133,21 @@ def forward(self, x): class LinearRelu(IPEXLinearFusion): r""" Applies a linear transformation to the `input` data, and then apply PyTorch RELU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.relu.html) on the result: + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.relu.html) + on the result: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.relu(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with relu. + linear (torch.nn.Linear module) : the original torch.nn.Linear module + to be fused with relu. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -125,6 +155,7 @@ class LinearRelu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -142,11 +173,19 @@ class LinearNewGelu(IPEXLinearFusion): Applies a linear transformation to the `input` data, and then apply NewGELUActivation (see https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L50) on the result: + + .. highlight:: python + .. code-block:: python + result = NewGELUActivation(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with new_gelu. + linear (torch.nn.Linear module) : the original torch.nn.Linear module + to be fused with new_gelu. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -154,6 +193,7 @@ class LinearNewGelu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -169,12 +209,21 @@ def forward(self, x): class LinearGelu(IPEXLinearFusion): r""" Applies a linear transformation to the `input` data, and then apply PyTorch GELU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.gelu.html) on the result: + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.gelu.html) + on the result: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.gelu(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with gelu. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with gelu. + Shape: Input and output shapes are the same as torch.nn.Linear. 
+ Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -182,6 +231,7 @@ class LinearGelu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -199,12 +249,19 @@ class LinearSiluMul(IPEXLinearFusion): Applies a linear transformation to the `input` data, then apply PyTorch SILU (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) on the result, and multiplies the result by `other`: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.silu(linear(input)) * other + Args: linear (torch.nn.Linear module) : the original torch.nn.Linear module to - be fused with silu and mul. + be fused with silu and mul. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -213,6 +270,7 @@ class LinearSiluMul(IPEXLinearFusion): >>> input = torch.randn(4096, 4096) >>> other = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other) + """ def __init__(self, linear): @@ -227,12 +285,21 @@ def forward(self, x, y): class LinearMul(IPEXLinearFusion): r""" - Applies a linear transformation to the `input` data, and then multiplies the result by `other`: + Applies a linear transformation to the `input` data, and then multiplies + the result by `other`: + + .. highlight:: python + .. code-block:: python + result = linear(input) * other + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with mul. + linear (torch.nn.Linear module) : the original torch.nn.Linear module + to be fused with mul. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -241,6 +308,7 @@ class LinearMul(IPEXLinearFusion): >>> input = torch.randn(4096, 4096) >>> other = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other) + """ def __init__(self, linear): @@ -255,12 +323,21 @@ def forward(self, x, y): class LinearAdd(IPEXLinearFusion): r""" - Applies a linear transformation to the `input` data, and then add the result by `other`: + Applies a linear transformation to the `input` data, + and then add the result by `other`: + + .. highlight:: python + .. code-block:: python + result = linear(input) + other + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with add. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with add. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -269,6 +346,7 @@ class LinearAdd(IPEXLinearFusion): >>> input = torch.randn(4096, 4096) >>> other = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other) + """ def __init__(self, linear): @@ -283,12 +361,21 @@ def forward(self, x, y): class LinearAddAdd(IPEXLinearFusion): r""" - Applies a linear transformation to the `input` data, and then add the result by `other_1` and `other_2`: + Applies a linear transformation to the `input` data, + and then add the result by `other_1` and `other_2`: + + .. highlight:: python + .. code-block:: python + result = linear(input) + other_1 + other_2 + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with add and add. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with add and add. 
+ Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -298,6 +385,7 @@ class LinearAddAdd(IPEXLinearFusion): >>> other_1 = torch.randn(4096, 4096) >>> other_2 = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other_1, other_2) + """ def __init__(self, linear): diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index 940fea611..1589b1444 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -7,7 +7,10 @@ class RotaryEmbedding(nn.Module): r""" [module init and forward] Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) - on the `query ` or `key` before their multi-head attention computation. + on the ``query`` or ``key`` before their multi-head attention computation. + + `module init` + Args: max_position_embeddings (int): size (max) of the position embeddings. pos_embd_dim (int): dimension of the position embeddings. @@ -19,20 +22,22 @@ class RotaryEmbedding(nn.Module): long_factor and short_factor, see details: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json#L23. - forward: - - input (torch.Tensor) : input to be applied with position embeddings, - taking shape of [batch size, sequence length, num_head/num_kv_head, head_dim] - (as well as the output shape). - - position_ids (torch.Tensor): the according position_ids for the input. - The shape should be [batch size, sequence length. In some cases, - there is only one element which the past_kv_length, and position id - can be constructed by past_kv_length + current_position. - - num_head (int) : head num from the input shape. - - head_dim (int) : head dim from the input shape. - - offset (int) : the offset value. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, - so the offset is 1. For llama, cos/sin is applied to the neighboring rotary_dim elements, - so the offset is rotary_dim/2. - - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + `forward()` + + Args: + input (torch.Tensor) : input to be applied with position embeddings, + taking shape of [batch size, sequence length, num_head/num_kv_head, head_dim] + (as well as the output shape). + position_ids (torch.Tensor): the according position_ids for the input. + The shape should be [batch size, sequence length. In some cases, + there is only one element which the past_kv_length, and position id + can be constructed by past_kv_length + current_position. + num_head (int) : head num from the input shape. + head_dim (int) : head dim from the input shape. + offset (int) : the offset value. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. For llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. Examples: >>> # module init: @@ -42,25 +47,29 @@ class RotaryEmbedding(nn.Module): >>> position_ids = torch.arange(32).unsqueeze(0) >>> query_rotery = rope_module(query, position_ids, 16, 256, 1, 64) - [Direct function call] This module also provides a `.apply_function` function call to be used on query and key - at the same time without initializing the module (assume rotary embedding - sin/cos values are provided). 
+ [Direct function call] This module also provides a `.apply_function` function call + to be used on query and key at the same time without initializing the module + (assume rotary embedding sin/cos values are provided). + + `apply_function()` + Args: - - query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of - [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). - - sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. - - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. - - head_dim (int) : head dim from the input shape. - - rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, - so the offset is 1. - if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, - so the offset is rotary_dim/2. - - position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids - for the input. The shape should be [batch size, sequence length]. - Return - - query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim]. + query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of + [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). + sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. + rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + head_dim (int) : head dim from the input shape. + rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. + if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids + for the input. The shape should be [batch size, sequence length]. + + Return: + query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim]. """ @@ -144,14 +153,9 @@ def apply_function( runtime_module = cls.runtime_ops.get_module_from_device( query.device.type, IPEXCustomOpType.ROPE, False ) - - query_, key_ = runtime_module.rotary_embedding( + query, key = runtime_module.rotary_embedding( query, key, sin, cos, rotary_dim, rotary_half, position_ids ) - - # keep the inplace context as used in TGI - query.copy_(query_) - key.copy_(key_) return query, key @@ -159,16 +163,20 @@ class FastLayerNorm(nn.Module): r""" [module init and forward] Applies PyTorch Layernorm (see https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) on the input (hidden states). + + `module init` + Args: - module init: - - normalized_shape ((int or list) or torch.Size) input shape from an expected input of size. - - eps (float): a value added to the denominator for numerical stability. - - weight (torch.Tensor): the weight of Layernorm to apply normalization. - - bias (torch.Tensor): an additive bias for normalization. + normalized_shape ((int or list) or torch.Size) input shape from an expected input of size. + eps (float): a value added to the denominator for numerical stability. 
+ weight (torch.Tensor): the weight of Layernorm to apply normalization. + bias (torch.Tensor): an additive bias for normalization. + + `forward()` - forward: - - hidden_states (torch.Tensor) : input to be applied Layernorm, usually taking shape of - [batch size, sequence length, hidden_size] (as well as the output shape). + Args: + hidden_states (torch.Tensor) : input to be applied Layernorm, usually taking shape of + [batch size, sequence length, hidden_size] (as well as the output shape). Examples: >>> # module init: @@ -179,13 +187,16 @@ class FastLayerNorm(nn.Module): >>> result = layernorm_module(input) [Direct function call] This module also provides a `.apply_function` function call to apply fast layernorm - without initializing the module. + without initializing the module. + + `apply_function()` + Args: - - hidden_states(torch.Tensor) : the input tensor to apply normalization. - - normalized_shape (int or list) or torch.Size) input shape from an expected input of size. - - weight (torch.Tensor): the weight to apply normalization. - - bias (torch.Tensor): an additive bias for normalization. - - eps (float): a value added to the denominator for numerical stability. + hidden_states(torch.Tensor) : the input tensor to apply normalization. + normalized_shape (int or list) or torch.Size) input shape from an expected input of size. + weight (torch.Tensor): the weight to apply normalization. + bias (torch.Tensor): an additive bias for normalization. + eps (float): a value added to the denominator for numerical stability. """ @@ -227,16 +238,20 @@ class RMSNorm(nn.Module): r""" [module init and forward] Applies RMSnorm on the input (hidden states). (see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L76) + + `module init` + Args: - module init: - - hidden_size (int) : the size of the hidden states. - - eps (float) : the variance_epsilon to apply RMSnorm, default using 1e-6. - - weight (torch.Tensor): the weight to apply RMSnorm, default None and will use `torch.ones(hidden_size)`. + hidden_size (int) : the size of the hidden states. + eps (float) : the variance_epsilon to apply RMSnorm, default using 1e-6. + weight (torch.Tensor): the weight to apply RMSnorm, default None + and will use `torch.ones(hidden_size)`. + + `forward()` - forward: - - hidden_states (torch.Tensor) : input to be applied RMSnorm, usually taking shape of - [batch size, sequence length, hidden_size] - (as well as the output shape). + Args: + hidden_states (torch.Tensor) : input to be applied RMSnorm, usually taking shape of + [batch size, sequence length, hidden_size] (as well as the output shape). Examples: >>> # module init: @@ -245,12 +260,15 @@ class RMSNorm(nn.Module): >>> input = torch.randn(1, 32, 4096) >>> result = rmsnorm_module(input) - [Direct function call] This module also provides a `.apply_function` function call to apply RMSNorm without - initializing the module. + [Direct function call] This module also provides a `.apply_function` function + call to apply RMSNorm without initializing the module. + + `apply_function()` + Args: - - hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. - - weight (torch.Tensor): the weight to apply RMSnorm. - - eps (float) : the variance_epsilon to apply RMSnorm. + hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. + weight (torch.Tensor): the weight to apply RMSnorm. + eps (float) : the variance_epsilon to apply RMSnorm. 
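A minimal eager-mode sketch of the normalization documented above, assuming the reference formulation from the linked modeling_llama.py rather than the fused IPEX kernel; shapes follow the Args description (hidden_states of [batch, seq_len, hidden_size], weight of [hidden_size]):

    import torch

    def rmsnorm_reference(hidden_states, weight, eps=1e-6):
        # Mean of squares over the hidden dimension, then scale by the reciprocal RMS.
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return weight * hidden_states * torch.rsqrt(variance + eps)

    x = torch.randn(1, 32, 4096)
    w = torch.ones(4096)
    y = rmsnorm_reference(x, w)  # same shape as x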
""" @@ -281,23 +299,31 @@ def forward(self, x: torch.Tensor): class VarlenAttention(nn.Module): r""" [module init and forward] Applies PyTorch scaled_dot_product_attention on the inputs of query, key and value - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), - and accept the variant (different) sequence length among the query, key and value. + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), + and accept the variant (different) sequence length among the query, key and value. + + This module does not have args for `module init`. + + `forward()` Args: - module init: this module does not have args for module init - forward: - - query (torch.Tensor): shape [query_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - key (torch.Tensor): shape [key_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - value (torch.Tensor): shape [value_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - out (torch.Tensor): buffer to get the results, the shape is the same as query. - - seqlen_q (torch.Tensor): shape [batch_size + 1], points the current query_tokens among total sequence length. - - seqlen_k (torch.Tensor): shape [batch_size + 1], points the current key_tokens among total sequence length. - - max_seqlen_q (int): max/total sequence length of query. - - max_seqlen_k (int): max/total sequence length of key. - - pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. - - softmax_scale (float): scaling factor applied is prior to softmax. - - is_causal (bool): whether to apply causal attention masking, default is True. + query (torch.Tensor): shape [query_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + key (torch.Tensor): shape [key_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + value (torch.Tensor): shape [value_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + out (torch.Tensor): buffer to get the results, the shape is the same as query. + seqlen_q (torch.Tensor): shape [batch_size + 1], points the + current query_tokens among total sequence length. + seqlen_k (torch.Tensor): shape [batch_size + 1], points the + current key_tokens among total sequence length. + max_seqlen_q (int): max/total sequence length of query. + max_seqlen_k (int): max/total sequence length of key. + pdropout (float): dropout probability; if greater than 0.0, + dropout is applied, default is 0.0. + softmax_scale (float): scaling factor applied is prior to softmax. + is_causal (bool): whether to apply causal attention masking, default is True. Examples: >>> # module init: @@ -315,10 +341,10 @@ class VarlenAttention(nn.Module): >>> softmax_scale = 0.5 >>> varlenAttention_module(query, key, value, out, seqlen_q, seqlen_k, max_seqlen_q, max_seqlen_k, pdropout, softmax_scale) - [Direct function call] This module also provides a `.apply_function` function call to apply VarlenAttention without - initializing the module. - Args: - - The parameters are the same as the forward call. + [Direct function call] This module also provides a `.apply_function` + function call to apply VarlenAttention without initializing the module. + + The parameters of `apply_function()` are the same as the `forward()` call. 
""" @@ -409,58 +435,65 @@ class PagedAttention: for key/value cache. The basic logic as following figure. Firstly, The DRAM buffer which includes num_blocks are pre-allocated to store key or value cache. For every block, block_size tokens can be stored. In the forward pass, the cache manager will firstly allocate some slots from this buffer and use reshape_and_cache API to store - the key/value and then use single_query_cached_kv_attention API to do the scale-dot-product of MHA. + the key/value and then use single_query_cached_kv_attention API to do the scale-dot-product of MHA. The block is basic allocation unit of paged attention and the token intra-block are stored one-by-one. The block tables are used to map the logical block of sequence into the physical block. [class method]: reshape_and_cache - ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) + ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) This operator is used to store the key/value token states into the pre-allcated kv_cache buffers of paged attention. + Args: - - key (torch.Tensor): The keytensor. The shape should be [num_seqs, num_heads, head_size]. - - value (torch.Tensor): The value tensor. The shape should be [num_seqs, num_heads, head_size]. - - key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - value_cache (torch.Tensor): The pre-allocated buffer to store the value cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - slot_mapping (torch.Tensor): It stores the position to store the key/value in the pre-allocated buffers. - The shape should be the number of sequences. For sequence _i_, the slot_mapping[i]//block_number - can get the block index, and the slot_mapping%block_size can get the offset of this block. + key (torch.Tensor): The keytensor. The shape should be [num_seqs, num_heads, head_size]. + value (torch.Tensor): The value tensor. The shape should be [num_seqs, num_heads, head_size]. + key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + value_cache (torch.Tensor): The pre-allocated buffer to store the value cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + slot_mapping (torch.Tensor): It stores the position to store the key/value in the pre-allocated buffers. + The shape should be the number of sequences. For sequence ``i``, the ``slot_mapping[i] // block_number`` + can get the block index, and the ``slot_mapping % block_size`` can get the offset of this block. [class method]: single_query_cached_kv_attention - ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( - out, - query, - key_cache, - value_cache, - head_mapping, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes - ) + + .. highlight:: python + .. code-block:: python + + ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( + out, + query, + key_cache, + value_cache, + head_mapping, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes + ) This operator is used to be calculated the scale-dot-product based on the paged attention. + Args: - - out (torch.Tensor): The output tensor with shape of [num_seqs, num_heads, head_size]. where the num_seqs - is the number of the sequence in this batch. 
The num_heads means the number of query - head. head_size means the head dimension. - - query (torch.Tensor): The query tensor. The shape should be [num_seqs, num_heads, head_size]. - - key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - value_cache(torch.Tensor): The pre-allocated buffer to store the value cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - head_mapping(torch.Tensor): The mapping from the query head to the kv head. The shape should be - the number of query heads. - - scale (float): The scale used by the scale-dot-product. In general, it is: float(1.0 / (head_size ** 0.5)). - - block_tables:(torch.Tensor): The mapping table used to mapping the logical sequence to the physical sequence. - The shape should be [num_seqs, max_num_blocks_per_seq]. - - context_lens (torch.Tensor): The sequence length for every sequence. The size is [num_seqs]. - - block_size (int): The block size which means the number of token in every block. - - max_context_len (int): The max sequence length. - - alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads). + out (torch.Tensor): The output tensor with shape of [num_seqs, num_heads, head_size], + where the num_seqs is the number of the sequence in this batch. The num_heads + means the number of query head. head_size means the head dimension. + query (torch.Tensor): The query tensor. The shape should be [num_seqs, num_heads, head_size]. + key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + value_cache(torch.Tensor): The pre-allocated buffer to store the value cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + head_mapping(torch.Tensor): The mapping from the query head to the kv head. + The shape should be the number of query heads. + scale (float): The scale used by the scale-dot-product. + In general, it is: ``float(1.0 / (head_size ** 0.5))``. + block_tables:(torch.Tensor): The mapping table used to mapping the logical sequence + to the physical sequence. The shape should be [num_seqs, max_num_blocks_per_seq]. + context_lens (torch.Tensor): The sequence length for every sequence. The size is [num_seqs]. + block_size (int): The block size which means the number of token in every block. + max_context_len (int): The max sequence length. + alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads). """ @@ -477,13 +510,7 @@ def reshape_and_cache( ): return cls.runtime_ops.get_module_from_device( key.device.type, IPEXCustomOpType.PAGED_ATTENTION, False - ).reshape_and_cache( - key, - value, - key_cache, - value_cache, - slot_mapping.int() if slot_mapping.dtype is torch.long else slot_mapping, - ) + ).reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) @classmethod def single_query_cached_kv_attention( @@ -527,37 +554,55 @@ class IndirectAccessKVCacheAttention(nn.Module): buffers(key and value use different buffers) to store all key/value hidden states and beam index information. It can use beam index history to decide which beam should be used by a timestamp and this information will generate an offset to access the kv_cache buffer. 
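To make the slot-mapping arithmetic above concrete, the sketch below resolves a flat slot index to a physical location in the pre-allocated cache. It takes the divisor to be the block size, consistent with the modulo in the same sentence of the Args text, and is an illustration only, not the reshape_and_cache kernel:

    import torch

    num_blocks, block_size, num_heads, head_size = 8, 16, 4, 64
    key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size)
    key = torch.randn(3, num_heads, head_size)       # one new token for each of 3 sequences
    slot_mapping = torch.tensor([3, 21, 40])         # flat slot per sequence

    for i, slot in enumerate(slot_mapping.tolist()):
        block_index = slot // block_size             # which physical block
        block_offset = slot % block_size             # position inside that block
        key_cache[block_index, block_offset] = key[i]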
+ Data Format: - - The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], - the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. - All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. - [Module init and forward] + The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], + the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. + All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. + + `module init` + Args: - module init - - text_max_length (int) : the max length of kv cache to be used for generation (allocate the pre-cache buffer). - - forward - - query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). - - layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - beam-idx: history beam idx, shape:(max_seq, beam*batch); - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). - - head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. - - attention_mask(torch.Tensor): Attention mask information. + text_max_length (int) : the max length of kv cache to be used + for generation (allocate the pre-cache buffer). + + `forward()` + + Args: + query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). + key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). + value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). + scale_attn (float):scale used by the attention layer. should be ``sqrt(head_size)``. + layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). + + - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - beam-idx: history beam idx, shape:(max_seq, beam*batch); + + - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). + + head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. + attention_mask(torch.Tensor): Attention mask information. Return: - - attn_output: weighted value which is the output of scale dot product. shape (beam*batch, seq_len, head_num, head_size). - - attn_weights: The output tensor of the first matmul in scale dot product which is not supported by kernel now. - - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). + attn_output: Weighted value which is the output of scale dot product. + shape (beam*batch, seq_len, head_num, head_size). + + attn_weights: The output tensor of the first matmul in scale dot product + which is not supported by kernel now. + + new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). 
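As intuition for the beam-index lookup described under Data Format, the gather below selects, for one timestamp, the cache rows belonging to each lane's recorded ancestor beam. It is a hand-written conceptual sketch under that assumption, not the indirect-access kernel's indexing code:

    import torch

    max_seq, beam_batch, head_num, head_size = 128, 4, 8, 64
    key_cache = torch.randn(max_seq, beam_batch, head_num, head_size)
    beam_idx = torch.zeros(max_seq, beam_batch, dtype=torch.long)  # beam history per timestamp

    t = 10
    ancestors = beam_idx[t]               # shape: [beam_batch]
    keys_at_t = key_cache[t, ancestors]   # shape: [beam_batch, head_num, head_size]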
Notes: - - How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model - see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model + see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + + .. highlight:: python + .. code-block:: python + def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: @@ -568,10 +613,10 @@ def _reorder_cache( layer_past[3][layer_past[0].size(-2) - 1] = beam_idx return past_key_values - [Direct function call] This module also provides a `.apply_function` function call to apply IndirectAccessKVCacheAttention - without initializing the module. - Args: - - The parameters are the same as the forward call. + [Direct function call] This module also provides a `.apply_function` function call + to apply IndirectAccessKVCacheAttention without initializing the module. + + The parameters of `apply_function()` are the same as the `forward()` call. """ From 1f688511c0dfe4af8b50fed13c16f3a98bcc84bd Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Wed, 22 May 2024 08:08:36 +0800 Subject: [PATCH 09/21] Update dependency_version.yml 20240522 (#2915) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 7ffa00ac3..68d27d4e9 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240520+cpu + version: 2.4.0.dev20240521+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240520+cpu + version: 2.2.0.dev20240521+cpu torchvision: - version: 0.19.0.dev20240520+cpu + version: 0.19.0.dev20240521+cpu transformers: version: 4.38.1 From 21b50308b852f942bf4a0d645e8c1a74dd6232c8 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Wed, 22 May 2024 10:26:31 +0800 Subject: [PATCH 10/21] Fix iakv regression (#2900) * Fix iakv regression * Remove unuse loop --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 250 +++++++++--------- 1 file changed, 125 insertions(+), 125 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index e497942b5..205c3c31f 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -557,10 +557,14 @@ scale_dot_product_for_indirect_access_kv_cache( auto thread_numbers = omp_get_max_threads(); auto max_parallel_parts = thread_numbers * 4; + auto target_block_size = 32L; + if (bs <= 32 and seq_len < 65536) { + target_block_size = 1L; + } auto kv_block_size = bs * head_num >= max_parallel_parts ? 
seq_len : std::max(seq_len / max_parallel_parts, 1L); - kv_block_size = std::min(kv_block_size, 32L); + kv_block_size = std::min(kv_block_size, target_block_size); auto kv_block_count = (seq_len + kv_block_size - 1) / kv_block_size; auto need_update_beam_idx = offset > 0 and bs > 1; auto b_ptr = beam_idx.data_ptr(); @@ -585,37 +589,48 @@ scale_dot_product_for_indirect_access_kv_cache( for (auto hi = 0; hi < head_num; hi++) { auto k_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - k_start); + auto query_ti = 0; for (auto ti = k_start; ti < k_start + block_size; ti++) { - for (auto query_ti = 0; query_ti < cur_len; query_ti++) { - auto kv_hi = hi / group_size; // maping the query head to - // key/value head to support MGA/MQA - auto q_ptr_start = q_ptr + - (bi * cur_len + query_ti) * head_num * head_size + - hi * head_size; - auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; - auto attn_w_pos = - attn_w_ptr + attn_w_stride + query_ti * seq_len + ti; - attn_w_pos[0] = 0.0f; - auto kc_token_start = ti * kc_token_stride; - auto kc_t_beam_start = kc_token_start; - auto beam = need_update_beam_idx ? new_beam_idx[bi][ti] : 0; - if (ti > - query_ti + offset) { // only caculate the innerproduct for - // the past token and current token - attn_w_pos[0] = -10000.0f; - } else if (ti == query_ti + offset) { // caculate the innerproduct - // for the current token and - // store the key - if (cur_len > 1) { // this may occur for processing the promt - auto beam_size = beam_batch / bs; - // need to store key accross beam - kc_t_beam_start = - kc_t_beam_start + bi * beam_size * kv_head * head_size; - } else { - kc_t_beam_start = kc_t_beam_start + bi * kv_head * head_size; - } - auto kc_head_start = - k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + auto kv_hi = hi / group_size; // maping the query head to + // key/value head to support MGA/MQA + auto q_ptr_start = q_ptr + + (bi * cur_len + query_ti) * head_num * head_size + + hi * head_size; + auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; + auto attn_w_pos = + attn_w_ptr + attn_w_stride + query_ti * seq_len + ti; + attn_w_pos[0] = 0.0f; + auto kc_token_start = ti * kc_token_stride; + auto kc_t_beam_start = kc_token_start; + auto beam = need_update_beam_idx ? 
new_beam_idx[bi][ti] : 0; + if (ti > query_ti + offset) { // only caculate the innerproduct for + // the past token and current token + attn_w_pos[0] = -10000.0f; + } else if (ti == query_ti + offset) { // caculate the innerproduct + // for the current token and + // store the key + if (cur_len > 1) { // this may occur for processing the promt + auto beam_size = beam_batch / bs; + // need to store key accross beam + kc_t_beam_start = + kc_t_beam_start + bi * beam_size * kv_head * head_size; + } else { + kc_t_beam_start = kc_t_beam_start + bi * kv_head * head_size; + } + auto kc_head_start = + k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + auto k_ptr_start = k_ptr + + (bi * cur_len + ti - offset) * kv_head * head_size + + kv_hi * head_size; + reduce_head( + q_ptr_start, + k_ptr_start, + attn_w_pos, + head_size, + true, + kc_head_start); + } else { // caculate the innerproduct for the past token + if (ti >= offset) { auto k_ptr_start = k_ptr + (bi * cur_len + ti - offset) * kv_head * head_size + kv_hi * head_size; @@ -624,38 +639,24 @@ scale_dot_product_for_indirect_access_kv_cache( k_ptr_start, attn_w_pos, head_size, - true, - kc_head_start); - } else { // caculate the innerproduct for the past token - if (ti >= offset) { - auto k_ptr_start = k_ptr + - (bi * cur_len + ti - offset) * kv_head * head_size + - kv_hi * head_size; - reduce_head( - q_ptr_start, - k_ptr_start, - attn_w_pos, - head_size, - false, - nullptr); - } else { + false, + nullptr); + } else { + kc_t_beam_start = kc_t_beam_start + beam * kv_head * head_size; + if (cur_len > 1) { + auto beam_size = beam_batch / bs; kc_t_beam_start = - kc_t_beam_start + beam * kv_head * head_size; - if (cur_len > 1) { - auto beam_size = beam_batch / bs; - kc_t_beam_start = - kc_t_beam_start + bi * beam_size * kv_head * head_size; - } - auto kc_head_start = - k_cache_ptr + kc_t_beam_start + kv_hi * head_size; - reduce_head( - q_ptr_start, - kc_head_start, - attn_w_pos, - head_size, - false, - nullptr); + kc_t_beam_start + bi * beam_size * kv_head * head_size; } + auto kc_head_start = + k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + reduce_head( + q_ptr_start, + kc_head_start, + attn_w_pos, + head_size, + false, + nullptr); } } } @@ -742,85 +743,84 @@ scale_dot_product_for_indirect_access_kv_cache( thread_id = omp_get_thread_num(); auto v_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - v_start); + auto query_ti = 0; for (auto vi = v_start; vi < v_start + block_size; vi++) { - for (auto query_ti = 0; query_ti < cur_len; query_ti++) { - auto kv_hi = hi / group_size; // maping the query head to - // key/value head to support MGA/MQA - auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; - auto attn_w_query_start = - attn_w_ptr + attn_w_stride + query_ti * seq_len; - // calculate weighted value and store the result to attn_outs[bs, - // head_num, cur_len, head_size] - auto attn_out_head_stride = thread_id * attn_outs_stride_priv + - (bi * head_num + hi) * cur_len * head_size; - auto attn_out_start = private_attn_out_ptr + - attn_out_head_stride + query_ti * head_size; + auto kv_hi = hi / group_size; // maping the query head to + // key/value head to support MGA/MQA + auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; + auto attn_w_query_start = + attn_w_ptr + attn_w_stride + query_ti * seq_len; + // calculate weighted value and store the result to attn_outs[bs, + // head_num, cur_len, head_size] + auto attn_out_head_stride = thread_id * attn_outs_stride_priv + + (bi * head_num + 
hi) * cur_len * head_size; + auto attn_out_start = private_attn_out_ptr + attn_out_head_stride + + query_ti * head_size; - auto vc_token_start = vi * kc_token_stride; - auto beam = need_update_beam_idx ? new_beam_idx[bi][vi] : 0; - if (vi == query_ti + offset) { // caculate the attention values - // for the current token - auto vc_t_beam_start = vc_token_start; - if (cur_len > 1) { // this may occur for processing the promt + auto vc_token_start = vi * kc_token_stride; + auto beam = need_update_beam_idx ? new_beam_idx[bi][vi] : 0; + if (vi == query_ti + offset) { // caculate the attention values + // for the current token + auto vc_t_beam_start = vc_token_start; + if (cur_len > 1) { // this may occur for processing the promt + auto beam_size = beam_batch / bs; + // removed the redundant computation, need to store key + // accross beam + vc_t_beam_start = + vc_t_beam_start + bi * beam_size * kv_head * head_size; + } else { + vc_t_beam_start = vc_t_beam_start + bi * kv_head * head_size; + } + auto v_cache_head_start = + v_cache_ptr + vc_t_beam_start + kv_hi * head_size; + auto v_ptr_start = v_ptr + + (bi * cur_len + vi - offset) * kv_head * head_size + + kv_hi * head_size; + mul_attenion_weights_and_value_of_head( + attn_w_query_start[vi], + v_ptr_start, + attn_out_start, + head_size, + true, + v_cache_head_start, + flag_access[thread_id][bi][hi]); + } else if (vi < query_ti + offset) { // caculate attention + // values for the past + // token + if (vi >= offset) { + auto v_ptr_start = v_ptr + + (bi * cur_len + vi - offset) * kv_head * head_size + + kv_hi * head_size; + mul_attenion_weights_and_value_of_head( + attn_w_query_start[vi], + v_ptr_start, + attn_out_start, + head_size, + false, + nullptr, + flag_access[thread_id][bi][hi]); + } else { + auto vc_t_beam_start = + vc_token_start + beam * kv_head * head_size; + if (cur_len > 1) { auto beam_size = beam_batch / bs; - // removed the redundant computation, need to store key - // accross beam vc_t_beam_start = vc_t_beam_start + bi * beam_size * kv_head * head_size; - } else { - vc_t_beam_start = vc_t_beam_start + bi * kv_head * head_size; } auto v_cache_head_start = v_cache_ptr + vc_t_beam_start + kv_hi * head_size; - auto v_ptr_start = v_ptr + - (bi * cur_len + vi - offset) * kv_head * head_size + - kv_hi * head_size; mul_attenion_weights_and_value_of_head( attn_w_query_start[vi], - v_ptr_start, + v_cache_head_start, attn_out_start, head_size, - true, - v_cache_head_start, + false, + nullptr, flag_access[thread_id][bi][hi]); - } else if (vi < query_ti + offset) { // caculate attention - // values for the past - // token - if (vi >= offset) { - auto v_ptr_start = v_ptr + - (bi * cur_len + vi - offset) * kv_head * head_size + - kv_hi * head_size; - mul_attenion_weights_and_value_of_head( - attn_w_query_start[vi], - v_ptr_start, - attn_out_start, - head_size, - false, - nullptr, - flag_access[thread_id][bi][hi]); - } else { - auto vc_t_beam_start = - vc_token_start + beam * kv_head * head_size; - if (cur_len > 1) { - auto beam_size = beam_batch / bs; - vc_t_beam_start = - vc_t_beam_start + bi * beam_size * kv_head * head_size; - } - auto v_cache_head_start = - v_cache_ptr + vc_t_beam_start + kv_hi * head_size; - mul_attenion_weights_and_value_of_head( - attn_w_query_start[vi], - v_cache_head_start, - attn_out_start, - head_size, - false, - nullptr, - flag_access[thread_id][bi][hi]); - } } - if (flag_access[thread_id][bi][hi] == 0) - flag_access[thread_id][bi][hi] = 1; } + if (flag_access[thread_id][bi][hi] == 0) + 
flag_access[thread_id][bi][hi] = 1; } } } From f95244a66db1d68c28536327dc5e1c12657eb1fd Mon Sep 17 00:00:00 2001 From: Xu Han Date: Wed, 22 May 2024 13:25:09 +0800 Subject: [PATCH 11/21] remove ipex cpu module's python dependency. (#2911) (#2914) * correct all_reduce schema * remove ipex cpu module's python dependency. --------- Co-authored-by: blzheng --- csrc/cpu/CMakeLists.txt | 8 -------- csrc/cpu/aten/kernels/MoEKrnl.cpp | 34 +++++++++++-------------------- tests/cpu/cpp/CMakeLists.txt | 8 -------- 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt index 74c7057d3..460621a3a 100644 --- a/csrc/cpu/CMakeLists.txt +++ b/csrc/cpu/CMakeLists.txt @@ -251,14 +251,6 @@ if(BUILD_STRIPPED_BIN) set_target_properties(${PLUGIN_NAME_CPU} PROPERTIES LINK_FLAGS_RELEASE -s) endif() -find_package(PythonLibs) -if(${PYTHONLIBS_FOUND}) - target_link_libraries(${PLUGIN_NAME_CPU} PUBLIC ${PYTHON_LIBRARIES}) -endif() - -find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib") -target_link_libraries(${PLUGIN_NAME_CPU} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) - install(TARGETS ${PLUGIN_NAME_CPU} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/csrc/cpu/aten/kernels/MoEKrnl.cpp b/csrc/cpu/aten/kernels/MoEKrnl.cpp index ea982318e..80d3ae2fa 100644 --- a/csrc/cpu/aten/kernels/MoEKrnl.cpp +++ b/csrc/cpu/aten/kernels/MoEKrnl.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "tpp/kernels/TPPGEMMKrnl.h" @@ -16,6 +15,15 @@ namespace cpu { namespace { +at::Tensor call_AllReduce(const at::Tensor& self) { + static auto op_allreduce = + c10::Dispatcher::singleton() + .findSchemaOrThrow("deepspeed_comm::all_reduce", "") + .typed(); + auto ret = op_allreduce.call(self); + return ret; +} + at::Tensor mixtral_moe_tpp_kernl_impl( const at::Tensor& hidden_states, const at::Tensor& top_x, @@ -46,13 +54,7 @@ at::Tensor mixtral_moe_tpp_kernl_impl( tpp_linear_nobias_forward_cpu(curr_state, down_wei, c10::nullopt); } if (is_distributed) { - py::gil_scoped_acquire acquire; - py::function allreduce = py::module_::import("torch") - .attr("ops") - .attr("deepspeed_comm") - .attr("all_reduce"); - allreduce(curr_state); - py::gil_scoped_release release; + call_AllReduce(curr_state); } curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); @@ -98,13 +100,7 @@ at::Tensor mixtral_moe_kernl_impl( c10::nullopt); } if (is_distributed) { - py::gil_scoped_acquire acquire; - py::function allreduce = py::module_::import("torch") - .attr("ops") - .attr("deepspeed_comm") - .attr("all_reduce"); - allreduce(curr_state); - py::gil_scoped_release release; + call_AllReduce(curr_state); } curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); @@ -130,13 +126,7 @@ at::Tensor mixtral_moe_woq_kernl_impl( down_wei); if (is_distributed) { - py::gil_scoped_acquire acquire; - py::function allreduce = py::module_::import("torch") - .attr("ops") - .attr("deepspeed_comm") - .attr("all_reduce"); - allreduce(curr_state); - py::gil_scoped_release release; + call_AllReduce(curr_state); } curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); diff --git a/tests/cpu/cpp/CMakeLists.txt b/tests/cpu/cpp/CMakeLists.txt index fc5dff343..cc299d0a5 100644 --- a/tests/cpu/cpp/CMakeLists.txt +++ b/tests/cpu/cpp/CMakeLists.txt @@ 
-69,13 +69,5 @@ target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC c10) # Link IPEX target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC intel-ext-pt-cpu) -find_package(PythonLibs) -if(${PYTHONLIBS_FOUND}) - target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC ${PYTHON_LIBRARIES}) -endif() - -find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib") -target_link_libraries(${CPU_CPP_TEST_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) - install(TARGETS ${CPU_CPP_TEST_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) From 6582a4ef6fcbf915059fa4350b71ae3bc0be49b5 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Wed, 22 May 2024 18:27:37 +0800 Subject: [PATCH 12/21] load low-precision checkpoints in safetensors format (#2917) * load low-precision checkpoints in safetensors format * Fix lint issue --- examples/cpu/inference/python/llm/run.py | 9 +++++++++ .../llm/single_instance/run_quantization.py | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 16c254520..16448e43f 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -470,6 +470,15 @@ def main(args_in: Optional[List[str]] = None) -> None: ) if args.gptq_legacy_format: quant_cmd.extend(["--gptq-legacy-format"]) + elif args.low_precision_checkpoint != "": + quant_cmd.extend( + [ + "--low-precision-checkpoint", + str(args.low_precision_checkpoint), + ] + ) + if args.gptq_legacy_format: + quant_cmd.extend(["--gptq-legacy-format"]) else: # No need to set group size if args.gptq is true # Group size is read from the checkpoint diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index e5024a201..07446a654 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -867,7 +867,21 @@ def calib_func(prepared_model): group_size=args.group_size, ) if args.low_precision_checkpoint != "": - low_precision_checkpoint = torch.load(args.low_precision_checkpoint) + if args.low_precision_checkpoint.endswith( + ".pt" + ) or args.low_precision_checkpoint.endswith(".pth"): + low_precision_checkpoint = torch.load(args.low_precision_checkpoint) + elif args.low_precision_checkpoint.endswith(".safetensors"): + try: + import safetensors + except ImportError: + print( + "Please install safetensors package to load safetensors checkpoint." 
+ ) + exit(1) + low_precision_checkpoint = safetensors.torch.load_file( + args.low_precision_checkpoint + ) if args.gptq_legacy_format: config_dict = ( ipex.utils.weight_only_quantization._legacy_lowp_checkpoint_config() From f43161d98a49ce056e55f77d73d23b9a89892a53 Mon Sep 17 00:00:00 2001 From: Cao E Date: Thu, 23 May 2024 08:49:32 +0800 Subject: [PATCH 13/21] enable ConcatLinear for fp16 LLM (#2909) --- .../transformers/models/cpu/fusions/linear_fusion.py | 4 ++-- tests/cpu/test_ipex_optimize_transformers.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index f7fe68b10..ed036dac0 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -358,9 +358,9 @@ def __init__(self, module, tpp=False, woq=False): mod, concat_scales, concat_zeros ) elif ( - self.tpp - and hasattr(module, "concat_linear") + hasattr(module, "concat_linear") and module.concat_linear is not None + and (self.tpp or module.concat_linear.weight.dtype == torch.half) ): self.concat_linear = module.concat_linear else: diff --git a/tests/cpu/test_ipex_optimize_transformers.py b/tests/cpu/test_ipex_optimize_transformers.py index b920f9c55..c11b7e93f 100644 --- a/tests/cpu/test_ipex_optimize_transformers.py +++ b/tests/cpu/test_ipex_optimize_transformers.py @@ -1,6 +1,7 @@ import unittest import torch import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core import sys import subprocess import os @@ -145,7 +146,8 @@ def model_replacement_check( with torch.no_grad(): key_hf = ref_m(**input_dict) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): key_ipex = ipex_m(**input_dict) error_message = f"model={m.name}, deployment_mode={deployment_mode}, torchcompile={torchcompile}, return_dict={return_dict}" @@ -160,6 +162,8 @@ def model_replacement_check( def test_model_replacement(self): dtypes = [torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) enable_torchcompile = [False, True] deployment_mode = [True, False] return_dict = [False, True] @@ -168,6 +172,8 @@ def test_model_replacement(self): ): if torchcompile and deployment_mode: continue + if dtype == torch.float16: + _disable_tpp() self.model_replacement_check(m, dtype, jit, torchcompile, return_dict) _disable_tpp() From c6a1de1119e74d807cd9fbb41936b09f1d7e4eb8 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Thu, 23 May 2024 11:23:17 +0800 Subject: [PATCH 14/21] Update dependency_version.yml 20240523 (#2919) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 68d27d4e9..e518c3829 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240521+cpu + version: 2.4.0.dev20240522+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240521+cpu + version: 2.2.0.dev20240522+cpu torchvision: - version: 0.19.0.dev20240521+cpu + version: 0.19.0.dev20240522+cpu transformers: version: 4.38.1 From 
91e920692fe81994dd41b5823f06530e65a63dc2 Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Thu, 23 May 2024 14:13:41 +0800 Subject: [PATCH 15/21] check diffusers version in test_stable_diffuser (#2918) --- tests/cpu/test_fx_optimization.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/cpu/test_fx_optimization.py b/tests/cpu/test_fx_optimization.py index 29ec7cb33..bea11e588 100644 --- a/tests/cpu/test_fx_optimization.py +++ b/tests/cpu/test_fx_optimization.py @@ -25,12 +25,15 @@ try: import diffusers - HAS_DIFFUSERS = True + if diffusers.__version__ > "0.25.0": + HAS_DIFFUSERS = False + else: + HAS_DIFFUSERS = True except ImportError: HAS_DIFFUSERS = False except RuntimeError: HAS_DIFFUSERS = False -skipIfNoDIFFUSERS = unittest.skipIf(not HAS_DIFFUSERS, "no diffusers") +skipIfNoDIFFUSERS = unittest.skipIf(not HAS_DIFFUSERS, "no expected diffusers version") class MultipleLinear(torch.nn.Module): From b29f8aa25221e12251d3ead2ce11f56217674141 Mon Sep 17 00:00:00 2001 From: blzheng Date: Thu, 23 May 2024 16:42:38 +0800 Subject: [PATCH 16/21] enable optimized whisper (#2923) --- .../run_accuracy_with_deepspeed.py | 408 +++++++++++++++++- .../run_generation_with_deepspeed.py | 29 +- examples/cpu/inference/python/llm/run.py | 16 +- .../llm/single_instance/run_accuracy.py | 296 ++++++++++++- .../llm/single_instance/run_generation.py | 29 +- .../llm/single_instance/run_quantization.py | 132 +++++- .../python/llm/utils/create_shard_model.py | 2 + .../python/llm/utils/model_class/llm.py | 1 + .../python/llm/utils/model_class/whisper.py | 49 +++ .../transformers/generation/beam_sample.py | 47 ++ .../transformers/generation/beam_search.py | 47 ++ .../transformers/generation/greedy_search.py | 48 +++ .../transformers/generation/sample.py | 48 +++ .../models/cpu/modules/attentions.py | 1 + .../models/cpu/modules/decoder.py | 18 + .../transformers/models/reference/models.py | 206 ++++++++- .../models/reference/modules/attentions.py | 106 +++++ .../models/reference/modules/decoder.py | 152 +++++++ .../transformers/optimize.py | 96 ++++- tests/cpu/hf_configs/whisper/config.json | 144 +++++++ ...test_ipex_optimize_transformers_nightly.py | 13 + 21 files changed, 1856 insertions(+), 32 deletions(-) create mode 100644 examples/cpu/inference/python/llm/utils/model_class/whisper.py create mode 100644 tests/cpu/hf_configs/whisper/config.json diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index b07ce9933..f608de80a 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -19,14 +19,24 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) - +from datasets import load_dataset +from torch.utils.data import DataLoader import sys sys.path.append(sys.path[0] + "/../../") try: + import lmms_eval + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + from lmms_eval import evaluator as lmms_evaluator + from lmms_eval import utils as lmms_utils + from lmms_eval.api.registry import ALL_TASKS + from lmms_eval.tasks import initialize_tasks from llava.model.language_model.llava_llama import ( # noqa F401 LlavaLlamaForCausalLM, ) @@ -43,14 +53,6 @@ DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, ) - import lmms_eval - 
from lmms_eval.api.instance import Instance - from lmms_eval.api.model import lmms - from lmms_eval.api.registry import register_model - from lmms_eval import evaluator as lmms_evaluator - from lmms_eval import utils as lmms_utils - from lmms_eval.api.registry import ALL_TASKS - from lmms_eval.tasks import initialize_tasks except ImportError: def register_model(name): @@ -85,6 +87,7 @@ def decorator(func): "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -1645,8 +1648,377 @@ def _collate(x): return res +class LibriSpeech: + _DEFAULT_MAX_LENGTH = 2048 + + def __init__( + self, + pretrained: str, + device="cpu", + with_ipex=True, + with_jit=True, + with_greedy=False, + batch_size=1, + max_length=None, + dtype: Optional[Union[str, torch.dtype]] = "auto", + tp_number=1, + config=None, + add_special_tokens=True, + ): + model_id = pretrained + self._device = device + self._batch_size = batch_size + self._with_jit = with_jit + self._with_ipex = with_ipex + self._with_greedy = with_greedy + self._max_length = max_length + self._dtype = dtype + self._tp_number = tp_number + self.add_special_tokens = add_special_tokens + + load_dtype = torch.float32 + infer_dtype = torch.float32 + if args.quant_with_amp or dtype == "bfloat16": + load_dtype = torch.bfloat16 + infer_dtype = torch.bfloat16 + else: + if dtype == "float16": + load_dtype = torch.half + infer_dtype = torch.half + elif dtype == "int8": + load_dtype = torch.float32 + infer_dtype = torch.int8 + self.amp_dtype = ( + torch.bfloat16 + if args.quant_with_amp or self._dtype == "bfloat16" + else torch.float32 + ) + model_type = next( + (x for x in MODEL_CLASSES.keys() if x in model_id.lower()), "auto" + ) + model_class = MODEL_CLASSES[model_type] + + self.tokenizer = model_class[1].from_pretrained( + model_id, trust_remote_code=True + ) + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, + ) + + # For now, Falcon, baichuan and gptbigcode have accuracy issue with from_config with deepspeed meta device load. 
+ # TODO: we will change the scope once deepspeed providing the support + if world_size == 1 or model_type in [ + "whisper", + ]: + self.model = model_class[0].from_pretrained( + model_id, + config=self.config, + low_cpu_mem_usage=True, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + else: + with deepspeed.OnDevice(dtype=load_dtype, device="meta"): + if model_class[0] == AutoModelForCausalLM: + self.model = ( + model_class[0] + .from_config(self.config, trust_remote_code=True) + .to(load_dtype) + ) + else: + self.model = model_class[0].from_pretrained( + model_id, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + + self.model = self.model.eval() + + checkpoints_json = "checkpoints.json" + + def print_rank0(*msg): + if local_rank != 0: + return + print(*msg) + + def get_repo_root(model_name_or_path): + if os.path.exists(model_name_or_path): + # local path + # use absolute path here to avoid path error in deepspeed + model_name_or_path = os.path.abspath(model_name_or_path) + return model_name_or_path + # checks if online or not + if is_offline_mode(): + print_rank0("Offline mode: forcing local_files_only=True") + # download only on first process + allow_patterns = ["*.bin", "*.model", "*.json", "*.txt", "*.py", "*LICENSE"] + if local_rank == 0: + snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + # ignore_patterns=["*.safetensors"], + ) + + dist.barrier() + + return snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + # ignore_patterns=["*.safetensors"], + ) + + def get_checkpoint_files(model_name_or_path): + cached_repo_dir = get_repo_root(model_name_or_path) + + # extensions: .bin | .pt + # creates a list of paths from all downloaded files in cache dir + file_list = [ + str(entry) + for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") + if entry.is_file() + ] + return file_list + + def write_checkpoints_json(): + checkpoint_files = get_checkpoint_files(model_id) + if local_rank == 0: + # model.config.model_type.upper() + data = { + "type": "BLOOM", + "checkpoints": checkpoint_files, + "version": 1.0, + } + json.dump(data, open(checkpoints_json, "w")) + + repo_root = get_repo_root(model_id) + write_checkpoints_json() + dist.barrier() + self.model = deepspeed.init_inference( + self.model, + mp_size=tp_number, + base_dir=repo_root, + dtype=infer_dtype, + checkpoint=checkpoints_json, + ) + + self.model = self.model.module + + if self._with_ipex: + ipex_woq_enabled = args.ipex_weight_only_quantization + if ipex_woq_enabled: + from intel_extension_for_pytorch.quantization import WoqWeightDtype + + if args.weight_dtype == "INT8": + weight_dtype = WoqWeightDtype.INT8 + elif args.weight_dtype == "INT4": + weight_dtype = WoqWeightDtype.INT4 + else: + assert args.weight_dtype == "NF4" + weight_dtype = WoqWeightDtype.NF4 + + if args.lowp_mode == "INT8": + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + elif args.lowp_mode == "FP32": + lowp_mode = ipex.quantization.WoqLowpMode.NONE + elif args.lowp_mode == "FP16": + lowp_mode = ipex.quantization.WoqLowpMode.FP16 + elif args.lowp_mode == "BF16": + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + else: # AUTO + if weight_dtype == WoqWeightDtype.INT4: + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + else: + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + + 
act_quant_mode_dict = { + "PER_TENSOR": ipex.quantization.WoqActQuantMode.PER_TENSOR, + "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, + "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + } + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode_dict[args.act_quant_mode], + group_size=args.group_size, + ) + self.model = ipex.llm.optimize( + self.model.eval(), + dtype=infer_dtype, + quantization_config=qconfig if ipex_woq_enabled else None, + inplace=True, + deployment_mode=False, + ) + + self.base_model = self.model + + self.num_beams = 1 if with_greedy else 4 + self.iter = 0 + + if self._with_jit: + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(self.config.num_hidden_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(self.amp_dtype) + sample_inputs = { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "past_key_values": past_key_values, + "encoder_outputs": (last_hidden_state,), + } + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + if self._dtype != "int8": + traced_model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + traced_model = torch.jit.freeze(traced_model.eval()) + else: + traced_model = torch.jit.load(args.quantized_model_path) + traced_model = torch.jit.freeze(traced_model.eval()) + + traced_model(**sample_inputs) + traced_model(**sample_inputs) + ipex._set_optimized_model_for_generation( + self.model, optimized_model=traced_model + ) + self.dataset = load_dataset("librispeech_asr", split="test.clean") + self.dataloader = DataLoader( + self.dataset, + batch_size=1, + shuffle=False, + ) + + def _levenshtein(self, a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b.""" + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + def word_error_rate(self, hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. 
+ + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError( + "In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references)) + ) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += self._levenshtein(h_list, r_list) + if words != 0: + wer = 1.0 * scores / words + else: + wer = float("inf") + return wer, scores, words + + def evaluate(self): + results = [] + references = [] + for batch_ndx, sample in enumerate(self.dataloader): + inputs = sample["audio"]["array"].squeeze(0) + model_inputs = self.tokenizer( + inputs, sampling_rate=16000, return_tensors="pt" + ).input_features + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + output = self.model.generate( + model_inputs, + do_sample=False, + temperature=0.9, + num_beams=self.num_beams, + ) + gen_text = self.tokenizer.batch_decode(output, skip_special_tokens=True) + if len(results) == 0: + results = gen_text + references = sample["text"] + else: + results += gen_text + references += sample["text"] + references = [r.capitalize() for r in references] + wer, scores, words = self.word_error_rate(results, references) + return wer, scores, words + + lm_tasks = [] lmms_tasks = [] +other_tasks = [] lm_all_tasks = lm_eval.tasks.ALL_TASKS try: initialize_tasks() @@ -1657,6 +2029,8 @@ def _collate(x): lm_tasks.append(task) elif task in ALL_TASKS: lmms_tasks.append(task) + elif task in ["librispeech_asr"]: + other_tasks.append(task) else: print(f"Task {task} in not supported by lm_eval and lmms_eval") exit(0) @@ -1730,3 +2104,19 @@ def _collate(x): cli_args=args, ) print(lmms_evaluator.make_table(results)) +elif len(other_tasks) != 0: + if "librispeech_asr" in other_tasks: + evaluator = LibriSpeech( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=True, + with_greedy=False, + ) + wer, scores, num_words = evaluator.evaluate() + print("Evaluation WER: {0}".format(wer)) + print("Accuracy: {:.15f} ".format(1 - wer)) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 4ec30d387..58e428d72 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -18,6 +18,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, TextStreamer, ) @@ -56,6 +57,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -141,6 +143,12 @@ type=str, help="image url for image-to-text task", ) +parser.add_argument( + "--audio", + default="example.flac", + type=str, + help="audio file for speech-to-text task", +) parser.add_argument("--print-memory", action="store_true") 
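# A minimal end-to-end sketch of the speech-to-text path that the new --audio
# argument feeds (illustrative, not part of the patch): load audio at 16 kHz,
# convert it to Whisper input features, generate, and decode. The checkpoint
# "openai/whisper-tiny" is an arbitrary example; "example.flac" matches the
# argument's default value.
import librosa
from transformers import AutoProcessor, WhisperForConditionalGeneration

processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").eval()

waveform, _ = librosa.load("example.flac", sr=16000)  # Whisper expects 16 kHz audio
features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
generated = model.generate(features, num_beams=4, do_sample=False)
print(processor.batch_decode(generated, skip_special_tokens=True)[0])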
parser.add_argument("--token-latency", action="store_true") parser.add_argument( @@ -336,6 +344,8 @@ def get_checkpoint_files(model_name_or_path): config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) +if model_type == "whisper": + config.text_max_length = config.max_source_positions + config.max_target_positions if model_type == "llava": config.use_cache = True @@ -374,6 +384,7 @@ def get_checkpoint_files(model_name_or_path): "git", "qwen", "yuan", + "whisper", ]: model = model_class[0].from_pretrained( model_name, @@ -586,6 +597,13 @@ def load_image(image_file): conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() inputs = [prompt] * args.batch_size +elif model_type == "whisper": + import librosa + + sample = librosa.load(args.audio, sr=16000) + prompt = sample[0] + inputs = [prompt] * args.batch_size + generate_kwargs.pop("min_new_tokens", None) else: # input tokens input_sentences = [] @@ -644,6 +662,9 @@ def generate(): for img in image ] input_tokens = {"input_ids": input_ids, "images": image_tensor} + elif model_type == "whisper": + input_tokens = tokenizer(inputs, sampling_rate=16000, return_tensors="pt") + input_ids = input_tokens.input_features else: input_tokens = tokenizer.batch_encode_plus( inputs, return_token_type_ids=False, return_tensors="pt" @@ -654,15 +675,17 @@ def generate(): input_tokens[t] = input_tokens[t].to( get_accelerator().current_device_name() ) - - outputs = model.generate(**input_tokens, **generate_kwargs) + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if infer_dtype == torch.bfloat16 else False + ): + outputs = model.generate(**input_tokens, **generate_kwargs) gen_ids = outputs[0] if args.token_latency else outputs input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] total_new_tokens = [ - o - i if model.config.model_type != "t5" else o + o if model.config.model_type in ["t5", "whisper"] else o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) ] gen_text = tokenizer.batch_decode( diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 16448e43f..e29689b30 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -233,7 +233,12 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument( "--image-url", default=None, type=str, help="image url for image-to-text task" ) - + parser.add_argument( + "--audio", + default=None, + type=str, + help="audio file for speech-to-text task", + ) # deepspeed inference related arguments. 
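# Why the token accounting in the generation hunk above special-cases "t5" and
# "whisper": encoder-decoder models return only decoder tokens from generate(),
# so the prompt length must not be subtracted. A sketch of that rule in
# isolation (count_new_tokens is a hypothetical helper for illustration):
def count_new_tokens(model_type: str, input_len: int, output_len: int) -> int:
    # Decoder-only models echo the prompt in the output ids; seq2seq models do not.
    if model_type in ("t5", "whisper"):
        return output_len
    return output_len - input_len

assert count_new_tokens("whisper", input_len=3000, output_len=42) == 42
assert count_new_tokens("llama", input_len=32, output_len=74) == 42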
parser.add_argument("--autotp", action="store_true") parser.add_argument("--shard-model", action="store_true") @@ -293,6 +298,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--config-file", str(args.config_file)]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + infer_cmd.extend(["--audio", str(args.audio)]) print("LLM RUNTIME INFO: running model geneartion...") result = subprocess.run(infer_cmd) @@ -422,6 +429,8 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd.extend(["--greedy"]) if args.image_url is not None: quant_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + quant_cmd.extend(["--audio", str(args.audio)]) if args.ipex_weight_only_quantization: quant_cmd.extend(["--ipex-weight-only-quantization"]) quant_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) @@ -551,6 +560,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--token-latency"]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + infer_cmd.extend(["--audio", str(args.audio)]) if args.prompt is not None: infer_cmd.extend(["--prompt", str(args.prompt)]) @@ -594,6 +605,7 @@ def main(args_in: Optional[List[str]] = None) -> None: "yuan": ("/yuan_local_shard"), "phi-3": ("/phi-3_local_shard"), "phi": ("/phi_local_shard"), + "whisper": ("/whisper_local_shard"), } model_type = next( ( @@ -657,6 +669,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--token-latency"]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + infer_cmd.extend(["--audio", str(args.audio)]) if args.prompt is not None: infer_cmd.extend(["--prompt", str(args.prompt)]) diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 5dbf4f249..2cdc8d563 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -7,6 +7,8 @@ import math import torch.nn.functional as F import re +from datasets import load_dataset +from torch.utils.data import DataLoader sys.path.append(sys.path[0] + "/../../") from transformers import ( @@ -14,6 +16,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) @@ -38,6 +41,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -100,6 +104,14 @@ import transformers try: + import lmms_eval + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + from lmms_eval import evaluator as lmms_evaluator + from lmms_eval import utils as lmms_utils + from lmms_eval.api.registry import ALL_TASKS + from lmms_eval.tasks import initialize_tasks from llava.model.language_model.llava_llama import ( # noqa F401 LlavaLlamaForCausalLM, ) @@ -116,14 +128,6 @@ DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, ) - import lmms_eval - from lmms_eval.api.instance import Instance - from lmms_eval.api.model import lmms - from lmms_eval.api.registry import register_model - from lmms_eval import evaluator as 
lmms_evaluator - from lmms_eval import utils as lmms_utils - from lmms_eval.api.registry import ALL_TASKS - from lmms_eval.tasks import initialize_tasks except ImportError: def register_model(name): @@ -1338,8 +1342,266 @@ def _collate(x): return res +class LibriSpeech: + def __init__( + self, + pretrained: str, + device: Optional[str] = "cpu", + with_ipex=True, + with_jit=True, + with_greedy=False, + batch_size=1, + dtype: Optional[Union[str, torch.dtype]] = "auto", + config=None, + add_special_tokens=True, + ) -> None: + model_id = pretrained + self.device = torch.device(device) + self.batch_size = int(batch_size) + self.with_jit = with_jit + self.with_ipex = with_ipex + self.with_greedy = with_greedy + self.dtype = dtype + self.add_special_tokens = add_special_tokens + load_dtype = torch.float32 + infer_dtype = torch.float32 + if dtype == "float16": + load_dtype = torch.half + infer_dtype = torch.half + elif dtype == "bfloat16": + load_dtype = torch.bfloat16 + infer_dtype = torch.bfloat16 + elif dtype in ["int8", "int4", "nf4"]: + load_dtype = torch.float32 + infer_dtype = torch.int8 + self.amp_dtype = ( + torch.bfloat16 + if args.quant_with_amp or self.dtype == "bfloat16" + else torch.float32 + ) + + model_type = next( + (x for x in MODEL_CLASSES.keys() if x in model_id.lower()), "auto" + ) + model_class = MODEL_CLASSES[model_type] + self.tokenizer = model_class[1].from_pretrained( + model_id, trust_remote_code=True + ) + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, + ) + self.config.torchscript = self.with_jit + if self.dtype in ("int8", "int4", "nf4"): + try: + with ipex.OnDevice(dtype=torch.float, device="meta"): + self.model = model_class[0].from_config( + self.config, trust_remote_code=True + ) + except (RuntimeError, AttributeError) as e: + print("Warning: Loading model to meta device failed:", e) + self.model = model_class[0].from_pretrained( + model_id, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + else: + self.model = model_class[0].from_pretrained( + model_id, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + + self.model = self.model.eval() + if with_ipex and dtype not in ["int8", "int4", "nf4"]: + self.model = ipex.llm.optimize( + self.model.eval(), + dtype=infer_dtype, + inplace=True, + deployment_mode=False, + ) + + if args.torch_compile: + if dtype in ["int8", "int4", "nf4"]: + raise SystemExit( + "[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype," + " please set dtype to float32 or bfloat16 if want to use torch.compile." + ) + if with_jit: + raise SystemExit( + "[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use" + " torch.compile." 
+ ) + self.model.forward = torch.compile( + self.model.forward, dynamic=True, backend=args.backend + ) + + self.base_model = self.model + + self.iter = 0 + self.num_beams = 1 if with_greedy else 4 + self.tp_number = 1 + if self.with_jit: + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(self.config.num_hidden_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(self.amp_dtype) + sample_inputs = { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "past_key_values": past_key_values, + "encoder_outputs": (last_hidden_state,), + } + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + if self.dtype != "int8": + traced_model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + traced_model = torch.jit.freeze(traced_model.eval()) + else: + traced_model = torch.jit.load(args.quantized_model_path) + traced_model = torch.jit.freeze(traced_model.eval()) + + traced_model(**sample_inputs) + traced_model(**sample_inputs) + ipex._set_optimized_model_for_generation( + self.model, optimized_model=traced_model + ) + self.dataset = load_dataset("librispeech_asr", split="test.clean") + self.dataloader = DataLoader( + self.dataset, + batch_size=1, + shuffle=False, + ) + + def _levenshtein(self, a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b.""" + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + def word_error_rate(self, hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. + + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError( + "In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. 
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references)) + ) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += self._levenshtein(h_list, r_list) + if words != 0: + wer = 1.0 * scores / words + else: + wer = float("inf") + return wer, scores, words + + def evaluate(self): + results = [] + references = [] + for batch_ndx, sample in enumerate(self.dataloader): + inputs = sample["audio"]["array"].squeeze(0) + model_inputs = self.tokenizer( + inputs, sampling_rate=16000, return_tensors="pt" + ).input_features + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + output = self.model.generate( + model_inputs, + do_sample=False, + temperature=0.9, + num_beams=self.num_beams, + ) + gen_text = self.tokenizer.batch_decode(output, skip_special_tokens=True) + if len(results) == 0: + results = gen_text + references = sample["text"] + else: + results += gen_text + references += sample["text"] + references = [r.capitalize() for r in references] + wer, scores, words = self.word_error_rate(results, references) + return wer, scores, words + + lm_tasks = [] lmms_tasks = [] +other_tasks = [] lm_all_tasks = lm_eval.tasks.ALL_TASKS try: initialize_tasks() @@ -1350,6 +1612,8 @@ def _collate(x): lm_tasks.append(task) elif task in ALL_TASKS: lmms_tasks.append(task) + elif task in ["librispeech_asr"]: + other_tasks.append(task) else: print(f"Task {task} in not supported by lm_eval and lmms_eval") exit(0) @@ -1420,3 +1684,19 @@ def _collate(x): cli_args=args, ) print(lmms_evaluator.make_table(results)) +elif len(other_tasks) != 0: + if "librispeech_asr" in other_tasks: + evaluator = LibriSpeech( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=True, + with_greedy=False, + ) + wer, scores, num_words = evaluator.evaluate() + print("Evaluation WER: {0}".format(wer)) + print("Accuracy: {:.15f} ".format(1 - wer)) diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 1c4c04a0f..a747adabe 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -10,6 +10,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) @@ -46,6 +47,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -104,6 +106,12 @@ type=str, help="image url for image-to-text task", ) +parser.add_argument( + "--audio", + default="example.flac", + type=str, + help="audio file for speech-to-text task", +) parser.add_argument( "--config-file", default=None, type=str, help="specific configuration file" ) @@ -170,6 +178,8 @@ config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) +if model_type == "whisper": + config.text_max_length = config.max_source_positions + config.max_target_positions if not hasattr(config, "lm_head_generation"): 
config.lm_head_generation = True @@ -246,6 +256,10 @@ def load_image(image_file): roles = conv.roles if re.search("yuan", model.config.architectures[0], re.IGNORECASE): model.config.batch_size = int(args.batch_size) * num_beams +if re.search("whisper", model.config.architectures[0], re.IGNORECASE): + import librosa + + sample = librosa.load(args.audio, sr=16000) def trace_handler(prof): @@ -297,6 +311,9 @@ def trace_handler(prof): conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() + elif model_type == "whisper": + prompt = sample[0] + generate_kwargs.pop("min_new_tokens", None) else: # input prompt current_path = pathlib.Path(__file__).parent.resolve() @@ -351,6 +368,11 @@ def trace_handler(prof): elif model_type == "git": input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values output = model.generate(pixel_values=input_ids, **generate_kwargs) + elif model_type == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) @@ -364,7 +386,7 @@ def trace_handler(prof): input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] total_new_tokens = [ - o - i if model.config.model_type != "t5" else o + o if model.config.model_type in ["t5", "whisper"] else o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) ] print(gen_text, total_new_tokens, flush=True) @@ -409,6 +431,11 @@ def trace_handler(prof): output = model.generate( pixel_values=input_ids, **generate_kwargs ) + elif model_type == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 07446a654..c31160757 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -38,6 +38,7 @@ from llm.utils.model_class.phi import PhiConfig from llm.utils.model_class.phi import Phi3Config from llm.utils.model_class.yuan import YuanConfig +from llm.utils.model_class.whisper import WhisperConfig parser = argparse.ArgumentParser("LLM generation script (int8 path)", add_help=False) parser.add_argument( @@ -72,6 +73,12 @@ type=str, help="image url for image-to-text task", ) +parser.add_argument( + "--audio", + default="example.flac", + type=str, + help="audio file for speech-to-text task", +) parser.add_argument( "--qconfig-summary-file", default="", help="qconfig for static quantization" ) @@ -349,6 +356,10 @@ def load_image(image_file): model = PhiConfig(args.model_id) elif re.search("yuan", config.architectures[0], re.IGNORECASE): model = YuanConfig(args.model_id) +elif re.search("whisper", config.architectures[0], re.IGNORECASE): + import librosa + + model = WhisperConfig(args.model_id) else: raise AssertionError("Not support %s." 
% (args.model_id)) @@ -359,6 +370,8 @@ def load_image(image_file): config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) if model.name in ["git", "llava"]: config.batch_size = int(args.batch_size) * num_beams +if model.name == "whisper": + config.text_max_length = config.max_source_positions + config.max_target_positions user_model = model.get_user_model(config, args.benchmark) @@ -529,6 +542,32 @@ def get_example_inputs(model): torch.ones((batch_size, 1), dtype=torch.long), tuple(past_key_value), ) + elif model.example_inputs_mode == EXAMPLE_INPUTS_MODE.KV_ENC: + past_key_value = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [1, 32, n_heads, head_dim], dtype=amp_dtype + ).contiguous(), + torch.zeros( + [1, 32, n_heads, head_dim], dtype=amp_dtype + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(n_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(amp_dtype) + example_inputs = ( + torch.ones(4).to(torch.long).unsqueeze(0), + past_key_value, + (last_hidden_state,), + ) else: raise RuntimeError( "Your model does not match existing example inputs used in ipex quantization, exiting..." @@ -573,6 +612,12 @@ def __init__( def tokenize_function(self, examples): if "prompt" in examples: example = self.tokenizer(examples["prompt"]) + elif "audio" in examples: + inputs = [d["array"] for d in examples["audio"]] + example = self.tokenizer( + inputs, sampling_rate=16000, return_tensors="pt" + ) + example["input_ids"] = example["input_features"] elif "text" in examples: example = self.tokenizer(examples["text"]) elif "code" in examples: @@ -689,6 +734,66 @@ def collate_batch(self, batch): tuple(global_past_key_value), (last_hidden_state,), ) + elif model.example_inputs_mode == EXAMPLE_INPUTS_MODE.KV_ENC: + input_bs = int(args.batch_size * num_beams) + model_kwargs = {} + model_kwargs = ( + user_model._prepare_encoder_decoder_kwargs_for_generation( + torch.vstack(input_ids_padded).unsqueeze(0), + model_kwargs, + "input_features", + ) + ) + last_hidden_state = model_kwargs["encoder_outputs"][ + "last_hidden_state" + ] + global_past_key_value = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + user_model.model.decoder.layers[i] + .encoder_attn.k_proj(last_hidden_state) + .view( + int(input_bs), + -1, + user_model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + user_model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ) + .contiguous(), + user_model.model.decoder.layers[i] + .encoder_attn.v_proj(last_hidden_state) + .view( + int(input_bs), + -1, + user_model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + user_model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(n_layers) + ] + ) + decoder_input_ids = ( + torch.zeros(input_bs).to(torch.long).unsqueeze(1) + ) + model_inputs = ( + decoder_input_ids, + tuple(global_past_key_value), + (last_hidden_state,), + ) else: raise RuntimeError( "Your model does not match existing example inputs used in ipex smooth quant, exiting..." 
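# The per-layer calibration inputs assembled above precompute the decoder's
# cross-attention K/V once from the encoder output, since they stay constant
# across decoding steps. A single-layer sketch of that projection (illustrative
# helper; names and shapes follow the patch, concrete sizes are assumptions):
def precompute_cross_kv(layer, encoder_last_hidden_state, batch_size):
    # encoder_last_hidden_state: [batch, src_len, d_model]
    num_heads = layer.encoder_attn.num_heads
    head_dim = layer.encoder_attn.head_dim
    k = (
        layer.encoder_attn.k_proj(encoder_last_hidden_state)
        .view(batch_size, -1, num_heads, head_dim)
        .contiguous()
    )
    v = (
        layer.encoder_attn.v_proj(encoder_last_hidden_state)
        .view(batch_size, -1, num_heads, head_dim)
        .contiguous()
    )
    return k, v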
@@ -699,9 +804,12 @@ def collate_batch(self, batch): return (model_inputs, last_ind) - calib_dataset = load_dataset( - args.dataset if args.dataset else model.default_dataset, split="train" - ) + if model.default_dataset == "librispeech_asr": + calib_dataset = load_dataset(model.default_dataset, split="train.clean.100") + else: + calib_dataset = load_dataset( + args.dataset if args.dataset else model.default_dataset, split="train" + ) if args.calib_shuffle: calib_dataset = calib_dataset.shuffle(seed=42) user_model.eval() @@ -975,6 +1083,10 @@ def calib_func(prepared_model): conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() image_processor = model.get_image_processor() + elif model.name == "whisper": + sample = librosa.load(args.audio, sr=16000) + prompt = sample[0] + generate_kwargs.pop("min_new_tokens", None) else: # input prompt current_path = pathlib.Path(__file__).parent.resolve() @@ -1028,6 +1140,11 @@ def calib_func(prepared_model): elif model.name == "git": input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values output = user_model.generate(pixel_values=input_ids, **generate_kwargs) + elif model.name == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = user_model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = user_model.generate(input_ids, **generate_kwargs) @@ -1040,7 +1157,7 @@ def calib_func(prepared_model): input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] total_new_tokens = [ - o - i if user_model.config.model_type != "t5" else o + o if user_model.config.model_type in ["t5", "whisper"] else o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) ] print(gen_text, total_new_tokens, flush=True) @@ -1061,7 +1178,7 @@ def trace_handler(prof): activities=[torch.profiler.ProfilerActivity.CPU], schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), on_trace_ready=trace_handler, - ) as prof: + ) as prof, torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): for i in range(5): if model.name == "llava": input_ids = torch.stack( @@ -1088,6 +1205,11 @@ def trace_handler(prof): output = user_model.generate( pixel_values=input_ids, **generate_kwargs ) + elif model.name == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = user_model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = user_model.generate(input_ids, **generate_kwargs) diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index fcd10d89f..ee2a4b386 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -5,6 +5,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) @@ -33,6 +34,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/utils/model_class/llm.py b/examples/cpu/inference/python/llm/utils/model_class/llm.py index 6a44ec8eb..4ef26b7fa 100644 --- 
a/examples/cpu/inference/python/llm/utils/model_class/llm.py +++ b/examples/cpu/inference/python/llm/utils/model_class/llm.py @@ -15,6 +15,7 @@ class EXAMPLE_INPUTS_MODE(IntEnum): MASK_KV_ENC = 5 MASK_KV_PIXEL = 6 EMBEDS_MASK_KV = 7 + KV_ENC = 8 class LLMConfig(ABC): diff --git a/examples/cpu/inference/python/llm/utils/model_class/whisper.py b/examples/cpu/inference/python/llm/utils/model_class/whisper.py new file mode 100644 index 000000000..da03c7ffe --- /dev/null +++ b/examples/cpu/inference/python/llm/utils/model_class/whisper.py @@ -0,0 +1,49 @@ +import torch +from .llm import LLMConfig, EXAMPLE_INPUTS_MODE +from transformers import WhisperForConditionalGeneration, AutoProcessor +import intel_extension_for_pytorch as ipex + + +class WhisperConfig(LLMConfig): + def __init__(self, model_id): + self.name = "whisper" + self.model_id = model_id + self.to_channels_last = True + self.example_inputs_mode = EXAMPLE_INPUTS_MODE.KV_ENC + + # for smooth quant + self.default_dataset = "librispeech_asr" + self.use_global_past_key_value = False + self.use_ipex_autotune = True + + def get_user_model(self, config, benchmark): + if benchmark: + try: + with ipex.OnDevice(dtype=torch.float, device="meta"): + self.model = WhisperForConditionalGeneration.from_pretrained( + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + except (RuntimeError, AttributeError): + self.model = WhisperForConditionalGeneration.from_pretrained( + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + else: + self.model = WhisperForConditionalGeneration.from_pretrained( + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + return self.model + + def get_tokenizer(self): + return AutoProcessor.from_pretrained(self.model_id) diff --git a/intel_extension_for_pytorch/transformers/generation/beam_sample.py b/intel_extension_for_pytorch/transformers/generation/beam_sample.py index a97fe98fc..713294dd9 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_sample.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_sample.py @@ -191,6 +191,7 @@ def _beam_sample( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: first_token = False if model_inputs["past_key_values"] is None: @@ -271,6 +272,46 @@ def _beam_sample( for i in range(self.config.num_hidden_layers) ] ) + elif self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + 
.contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) if first_token: if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer @@ -308,6 +349,12 @@ def _beam_sample( model_inputs["encoder_outputs"] = ( model_inputs["encoder_outputs"]["last_hidden_state"], ) + if self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if self.model_backbone == "LlavaLlamaForCausalLM" and hasattr( self, "prepare_inputs_labels_for_multimodal" ): diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index 9ce5b276e..e36e329a0 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -193,6 +193,7 @@ def _beam_search( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: first_token = False has_position_id = model_inputs.get("position_ids", None) is not None @@ -274,6 +275,46 @@ def _beam_search( for i in range(self.config.num_hidden_layers) ] ) + elif self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) if first_token and self.model_backbone != "YuanForCausalLM": if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer @@ -328,6 +369,12 @@ def _beam_search( model_inputs["encoder_outputs"] = ( model_inputs["encoder_outputs"]["last_hidden_state"], ) + if self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if self.model_backbone == "LlavaLlamaForCausalLM" and hasattr( self, "prepare_inputs_labels_for_multimodal" ): diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index 85d8f2ab6..b874668bb 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -174,6 +174,7 @@ def _greedy_search( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: 
first_token = False input_bs = input_ids.size()[0] @@ -229,6 +230,47 @@ def _greedy_search( for i in range(self.config.num_hidden_layers) ] ) + if self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if first_token: if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer @@ -279,6 +321,12 @@ def _greedy_search( self, "prepare_inputs_labels_for_multimodal" ): model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) + elif self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if first_token and self.model_backbone == "YuanForCausalLM": model_inputs.pop("past_key_values", None) if hasattr(self, "trace_graph"): diff --git a/intel_extension_for_pytorch/transformers/generation/sample.py b/intel_extension_for_pytorch/transformers/generation/sample.py index f446b412c..22f6bc5d0 100644 --- a/intel_extension_for_pytorch/transformers/generation/sample.py +++ b/intel_extension_for_pytorch/transformers/generation/sample.py @@ -180,6 +180,7 @@ def _sample( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: first_token = False input_bs = input_ids.size()[0] @@ -235,6 +236,47 @@ def _sample( for i in range(self.config.num_hidden_layers) ] ) + if self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if first_token: if hasattr(self.config, 
"n_layer"): num_hidden_layers = self.config.n_layer @@ -287,6 +329,12 @@ def _sample( model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) if first_token and self.model_backbone == "YuanForCausalLM": model_inputs.pop("past_key_values", None) + if self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if hasattr(self, "trace_graph"): model_inputs.pop("use_cache", None) model_inputs.pop("token_type_ids", None) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 29491dfe0..cb45433b9 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -26,6 +26,7 @@ def __init__(self, module, config, tpp=False, woq=False): "T5ForConditionalGeneration", "MptForCausalLM", "GitForCausalLM", + "WhisperForConditionalGeneration", ] or self.model_backbone == "BaichuanForCausalLM" and hasattr(module, "rotary_emb") diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 7d578cffb..5ed1a85d2 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -214,5 +214,23 @@ def __init__(self, module, config, tpp=False, woq=False): self.mha_linear_add = _IPEXlinearAddCPU( module.mha_linear_add.linear, tpp=tpp, woq=woq ) + elif self.model_backbone == "WhisperForConditionalGeneration": + if not self.distributed: + if hasattr(module, "mha_linear_add"): + self.mha_linear_add = _IPEXlinearAddCPU( + module.mha_linear_add.linear, tpp=tpp, woq=woq + ) + if hasattr(module, "mlp_linear_add"): + self.mlp_linear_add = _IPEXlinearAddCPU( + module.mlp_linear_add.linear, tpp=tpp, woq=woq + ) + if hasattr(module, "encoder_mha_linear_add"): + self.encoder_mha_linear_add = _IPEXlinearAddCPU( + module.encoder_mha_linear_add.linear, tpp=tpp, woq=woq + ) + if hasattr(module, "linear_gelu"): + self.linear_gelu = _IPEXlinearGeluCPU( + module.linear_gelu.linear, tpp=tpp, woq=woq + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 4fb722d6a..c8b78fc34 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -7,9 +7,10 @@ CausalLMOutputWithCrossAttentions, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput, + Seq2SeqModelOutput, BaseModelOutput, ) - +import numpy as np from ....utils._logger import logger, WarningType import transformers @@ -26,6 +27,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) + from transformers.generation.configuration_utils import GenerationConfig except ImportError: pass @@ -3345,6 +3347,203 @@ def Phi3Model_forward( ) +def WhisperModel_forward( + self, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = 
None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, + decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if encoder_outputs is None: + input_features = self._mask_input_features( + input_features, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + input_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + position_ids=decoder_position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + tuple(encoder_outputs) + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def WhisperForConditionalGeneration_forward( + self, + decoder_input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, + 
decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]: + if labels is not None: + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_features, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + decoder_inputs_embeds=decoder_inputs_embeds, + decoder_position_ids=decoder_position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=False, + ) + + sequence_output = outputs[0] + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and sequence_output.size(1) != 1 + ): + sequence_output = sequence_output[:, -1:, :] + lm_logits = self.proj_out(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(lm_logits.device) + loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1)) + + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + +def detect_language( + self, + input_features: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None, + generation_config: Optional[GenerationConfig] = None, + num_segment_frames: int = 3000, +) -> torch.Tensor: + if input_features is None and encoder_outputs is None: + raise ValueError( + "You have to specify either `input_features` or `encoder_outputs`" + ) + elif input_features is not None and encoder_outputs is not None: + raise ValueError( + "Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!" 
+ ) + elif input_features is not None: + inputs = {"input_features": input_features[:, :, :num_segment_frames]} + batch_size = input_features.shape[0] + elif encoder_outputs is not None: + inputs = {"encoder_outputs": encoder_outputs} + batch_size = ( + encoder_outputs[0].shape[0] + if isinstance(encoder_outputs, BaseModelOutput) + else encoder_outputs[0] + ) + + generation_config = generation_config or self.generation_config + decoder_input_ids = ( + torch.ones((batch_size, 1), device=self.device, dtype=torch.long) + * generation_config.decoder_start_token_id + ) + + with torch.no_grad(): + outputs = self(**inputs, decoder_input_ids=decoder_input_ids) + if isinstance(outputs, tuple): + logits = outputs[0][:, -1] + else: + logits = outputs.logits[:, -1] + + non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool) + non_lang_mask[list(generation_config.lang_to_id.values())] = False + + logits[:, non_lang_mask] = -np.inf + + lang_ids = logits.argmax(-1) + + return lang_ids + + def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): if module.config.use_return_dict or ( "return_dict" in kwargs and kwargs["return_dict"] @@ -3398,7 +3597,10 @@ def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): ) or module.config.output_attentions: encoder_attentions = outputs[idx] idx += 1 - if module.config.architectures[0] == "T5ForConditionalGeneration": + if module.config.architectures[0] in [ + "T5ForConditionalGeneration", + "WhisperForConditionalGeneration", + ]: return Seq2SeqLMOutput( loss=loss, logits=logits, diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 2bbd4bd50..85e0f47ae 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -2022,6 +2022,101 @@ def _Phi3Attention_forward( return attn_output, attn_weights, past_key_value +def _WhisperAttention_forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + is_cross_attention = key_value_states is not None + bsz, tgt_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) * self.scaling + if is_cross_attention and past_key_value is not None: + key_states = past_key_value[1].contiguous() + value_states = past_key_value[2].contiguous() + elif is_cross_attention: + key_states = ( + self.k_proj(key_value_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + value_states = ( + self.v_proj(key_value_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + else: + key_states = ( + self.k_proj(hidden_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + value_states = ( + self.v_proj(hidden_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + + query_states = query_states.view( + bsz, -1, self.num_heads, self.head_dim + ).contiguous() + + src_len = key_states.size(1) + if attention_mask is None: + seq_len = ( + src_len + past_key_value[0].size(-2) + if past_key_value is not None + else src_len + ) + attention_mask = torch.zeros( + [bsz, 1, tgt_len, 
seq_len], dtype=hidden_states.dtype + ) + if key_value_states is None and self.is_decoder: + decoded_tokens = ( + torch.tensor(past_key_value[0].size(-2)) + if past_key_value is not None + else None + ) + else: + decoded_tokens = torch.zeros(1, dtype=torch.long).contiguous()[0] + + ( + attn_output, + attn_weights, + past_key_value, + ) = self._IPEXScaleDotProduct( + query_states, + key_states, + value_states, + 1, + past_key_value, + layer_head_mask, + attention_mask, + None, + False, + decoded_tokens, + ) + if is_cross_attention: + past_key_value = ( + past_key_value[0], + key_states, + value_states, + past_key_value[3], + ) + if not output_attentions: + attn_weights = None + if not self.is_decoder: + past_key_value = None + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, -1) + # attn_output = self.out_proj(attn_output) + return attn_output, attn_weights, past_key_value + + def _create_attention_mask_for_git( self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None ): @@ -2151,6 +2246,7 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): "T5ForConditionalGeneration", "MptForCausalLM", "GitForCausalLM", + "WhisperForConditionalGeneration", ] or ( self.model_backbone == "BaichuanForCausalLM" @@ -2648,6 +2744,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "WhisperForConditionalGeneration": + return _WhisperAttention_forward( + self, + hidden_states, + key_value_states, + past_key_value, + attention_mask, + layer_head_mask, + output_attentions, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 9e3a29c6e..fb56ca61b 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -1273,6 +1273,128 @@ def Phi3DecoderLayer_forward( return outputs +def WhisperEncoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, +) -> torch.Tensor: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.out_proj(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.linear_gelu(hidden_states) + if not self.distributed: + hidden_states = self.mlp_linear_add(hidden_states, residual) + else: + hidden_states = self.fc2(hidden_states) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +def WhisperDecoderLayer_forward( + self, + hidden_states: torch.Tensor, + 
attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, +) -> torch.Tensor: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + self_attn_past_key_value = ( + past_key_value[:4] if past_key_value is not None else None + ) + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.out_proj(hidden_states) + hidden_states = residual + hidden_states + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + cross_attn_past_key_value = ( + past_key_value[4:] if past_key_value is not None else None + ) + ( + hidden_states, + cross_attn_weights, + cross_attn_present_key_value, + ) = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + if not self.distributed: + hidden_states = self.encoder_mha_linear_add(hidden_states, residual) + else: + hidden_states = self.encoder_attn.out_proj(hidden_states) + hidden_states = residual + hidden_states + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + hidden_states = self.linear_gelu(hidden_states) + if not self.distributed: + hidden_states = self.mlp_linear_add(hidden_states, residual) + else: + hidden_states = self.fc2(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + class _IPEXDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): super().__init__() @@ -1470,6 +1592,19 @@ def __init__(self, module, config, distributed=False): del self.__dict__["_modules"]["mlp"].down_proj self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.o_proj) del self.__dict__["_modules"]["self_attn"].o_proj + elif self.model_backbone == "WhisperForConditionalGeneration": + if not self.distributed: + self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.out_proj) + del self.__dict__["_modules"]["self_attn"].out_proj + self.mlp_linear_add = _IPEXlinearAddRef(module.fc2) + del self.__dict__["_modules"]["fc2"] + if hasattr(module, "encoder_attn"): + self.encoder_mha_linear_add = _IPEXlinearAddRef( + module.encoder_attn.out_proj + ) + del self.__dict__["_modules"]["encoder_attn"].out_proj + self.linear_gelu = _IPEXlinearGeluRef(module.fc1) + del self.__dict__["_modules"]["fc1"] else: AssertionError(False, "Do not support the optimization of your model yet") @@ -1710,5 +1845,22 @@ def 
forward( use_cache, past_key_value, ) + elif self.model_backbone == "WhisperForConditionalGeneration": + if encoder_hidden_states is not None: + return WhisperDecoderLayer_forward( + self, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + layer_head_mask, + cross_attn_layer_head_mask, + past_key_value, + output_attentions, + use_cache, + ) + return WhisperEncoderLayer_forward( + self, hidden_states, attention_mask, layer_head_mask, output_attentions + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 661daae5a..bbd8d4bd1 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -187,10 +187,13 @@ def model_convert_reference(_model): PhiForCausalLM_forward, PhiModel_forward, Phi3Model_forward, + WhisperForConditionalGeneration_forward, + WhisperModel_forward, prepare_inputs_for_generation, prepare_inputs_for_generation_gptbigcode, prepare_inputs_for_generation_llama, prepare_inputs_labels_for_multimodal_llavallama, + detect_language, ) if not hasattr(_model.config, "architectures"): @@ -786,6 +789,31 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) + elif _model.config.architectures[0] == "WhisperForConditionalGeneration": + convert_function(_model, "detect_language", detect_language) + convert_function(_model, "forward", WhisperForConditionalGeneration_forward) + convert_function(_model.model, "forward", WhisperModel_forward) + convert_class( + _model, + type(_model.model.encoder.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.decoder.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.encoder.layers[0].self_attn), + _IPEXAttentionRef, + _model.config, + distributed=distributed, + ) return _model @@ -888,6 +916,60 @@ def get_dummy_input(_model, return_dict=False): (last_hidden_state,), ) ) + elif _model.config.architectures[0] == "WhisperForConditionalGeneration": + dtype = ( + _model.model.decoder.layers[0].mha_linear_add.dtype + if hasattr(_model.model.decoder.layers[0], "mha_linear_add") + else _model.dtype + ) + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [ + 1, + 32, + _model.model.decoder.layers[i].encoder_attn.num_heads, + _model.model.decoder.layers[i].encoder_attn.head_dim, + ], + dtype=dtype, + ).contiguous(), + torch.zeros( + [ + 1, + 32, + _model.model.decoder.layers[i].encoder_attn.num_heads, + _model.model.decoder.layers[i].encoder_attn.head_dim, + ], + dtype=dtype, + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(model_num_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(dtype) + sample_inputs = ( + ( + { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "past_key_values": past_key_values, + "encoder_outputs": (last_hidden_state,), + } + ) + if return_dict + else ( + torch.ones(1).to(torch.long).unsqueeze(0), + past_key_values, + (last_hidden_state,), + ) + ) + else: sample_inputs = ( { @@ -1106,6 +1188,13 @@ def 
model_convert_lowering( getattr(_model, model_name), "_use_sdpa" ): getattr(_model, model_name)._use_sdpa = False + if hasattr(_model, model_name): + cur_mod = getattr(_model, model_name) + for submodel_name in ["encoder", "decoder"]: + if hasattr(cur_mod, submodel_name) and hasattr( + getattr(cur_mod, submodel_name), "_use_sdpa" + ): + getattr(cur_mod, submodel_name)._use_sdpa = False for supported_mlp_class in [_IPEXDecoderLayerRef]: lowering_class_cpu( @@ -1217,7 +1306,7 @@ def optimize( Well supported model family with full functionalities: Llama, GPT-J, GPT-Neox, OPT, Falcon, Bloom, CodeGen, Baichuan, ChatGLM, GPTBigCode, - T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan, Phi. + T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan, Phi, Whisper. For the model that is not in the scope of supported model family above, will try to apply default ipex.optimize transparently to get benifits (not include quantizations, @@ -1307,6 +1396,7 @@ def optimize( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ] if well_supported_model: @@ -1315,8 +1405,8 @@ def optimize( if quantization_config is not None: logger.warning( "ipex.llm.optimize supports quantizations on Llama, GPT-J, GPT-Neox, Falcon, OPT, Bloom, CodeGen," - + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, Yuan, " - + "and Phi, fallback to origin model" + + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, Yuan," + + " Phi, and Whisper, fallback to origin model" ) return model diff --git a/tests/cpu/hf_configs/whisper/config.json b/tests/cpu/hf_configs/whisper/config.json new file mode 100644 index 000000000..b1d5ff58e --- /dev/null +++ b/tests/cpu/hf_configs/whisper/config.json @@ -0,0 +1,144 @@ +{ + "_name_or_path": "openai/whisper-large-v2", + "activation_dropout": 0.0, + "activation_function": "gelu", + "architectures": [ + "WhisperForConditionalGeneration" + ], + "attention_dropout": 0.0, + "begin_suppress_tokens": [ + 220, + 50257 + ], + "bos_token_id": 50257, + "d_model": 1280, + "decoder_attention_heads": 20, + "decoder_ffn_dim": 5120, + "decoder_layerdrop": 0.0, + "decoder_layers": 1, + "decoder_start_token_id": 50258, + "dropout": 0.0, + "encoder_attention_heads": 20, + "encoder_ffn_dim": 5120, + "encoder_layerdrop": 0.0, + "encoder_layers": 1, + "eos_token_id": 50257, + "forced_decoder_ids": [ + [ + 1, + 50259 + ], + [ + 2, + 50359 + ], + [ + 3, + 50363 + ] + ], + "init_std": 0.02, + "is_encoder_decoder": true, + "max_length": 448, + "max_source_positions": 1500, + "max_target_positions": 448, + "model_type": "whisper", + "num_hidden_layers": 1, + "num_mel_bins": 80, + "pad_token_id": 50257, + "scale_embedding": false, + "suppress_tokens": [ + 1, + 2, + 7, + 8, + 9, + 10, + 14, + 25, + 26, + 27, + 28, + 29, + 31, + 58, + 59, + 60, + 61, + 62, + 63, + 90, + 91, + 92, + 93, + 359, + 503, + 522, + 542, + 873, + 893, + 902, + 918, + 922, + 931, + 1350, + 1853, + 1982, + 2460, + 2627, + 3246, + 3253, + 3268, + 3536, + 3846, + 3961, + 4183, + 4667, + 6585, + 6647, + 7273, + 9061, + 9383, + 10428, + 10929, + 11938, + 12033, + 12331, + 12562, + 13793, + 14157, + 14635, + 15265, + 15618, + 16553, + 16604, + 18362, + 18956, + 20075, + 21675, + 22520, + 26130, + 26161, + 26435, + 28279, + 29464, + 31650, + 32302, + 32470, + 36865, + 42863, + 47425, + 49870, + 50254, + 50258, + 50358, + 50359, + 50360, + 50361, + 50362 + ], + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + 
"use_cache": true, + "vocab_size": 51865 +} \ No newline at end of file diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 0f697d4be..0b34ba776 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -172,6 +172,13 @@ lambda m: m.model.layers[0].self_attn.__class__, lambda m: m.model.layers[0].__class__, ), + model_info( + "whisper", + transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration, + False, + lambda m: m.model.decoder.layers[0].self_attn.__class__, + lambda m: m.model.decoder.layers[0].__class__, + ), ] @@ -247,6 +254,12 @@ def model_replacement_check( input_dict["decoder_input_ids"] = decoder_input_ids.unsqueeze(0) if m.name == "git": input_dict["pixel_values"] = torch.zeros(1, 3, 224, 224) + if m.name == "whisper": + last_hidden_state = torch.rand([1, 32, 1280]) + input_dict = { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "encoder_outputs": (last_hidden_state,), + } with torch.no_grad(): key_hf = ref_m(**input_dict) From 57f381b16fb5ba79ec829619488974dab211329d Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 23 May 2024 19:07:45 +0800 Subject: [PATCH 17/21] update oneDNN to 74fb846b88 on main (#2924) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 0430e47c6..7d2635dc9 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 0430e47c6b2704627977b99ab5556aa0ba6908ce +Subproject commit 7d2635dc94a53637287c4c144ff0618f3472e2c1 From 798da3b7de4095616414e60d23c62d1a493ef24c Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Fri, 24 May 2024 13:17:38 +0800 Subject: [PATCH 18/21] fix rope for BS > 1 (#2912) * fix rope for BS > 1 * fix ut --- csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp | 10 ++++++++-- tests/cpu/test_rope.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp index ba044e2e6..0894ef23e 100644 --- a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp +++ b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp @@ -86,7 +86,12 @@ std::tuple ApplyROPEKernel( auto out_stride_kb = concat_qkv ? key.stride(0) : 0; auto out_stride_ks = concat_qkv ? key.stride(1) : 0; auto emb_pos_ptr = t_emb_pos.data_ptr(); // [MP][HR] - auto pos_ptr = t_pos.data_ptr(); // [MB][S] + auto pos_ptr = t_pos.data_ptr(); // [B][S] or [1][S] + bool t_pos_no_repeated_for_batch = false; + if (t_pos.numel() != 1 && t_pos.size(0) == 1 && B > 1) { + // we do not perform t_pos.repeat here to avoid the overhead of copying + t_pos_no_repeated_for_batch = true; + } { #pragma omp parallel for collapse(3) for (int b = 0; b < B; b++) { @@ -106,7 +111,8 @@ std::tuple ApplyROPEKernel( sin_start = emb_pos_ptr + (p + s) * HR; cos_start = emb_pos_ptr + (p + s) * HR + COFF; } else { - p = pos_ptr[b * S + s]; + auto start_idx = t_pos_no_repeated_for_batch ? 
0 : b * S; + p = pos_ptr[start_idx + s]; sin_start = emb_pos_ptr + p * HR; cos_start = emb_pos_ptr + p * HR + COFF; } diff --git a/tests/cpu/test_rope.py b/tests/cpu/test_rope.py index 3e9a8575a..70f482476 100644 --- a/tests/cpu/test_rope.py +++ b/tests/cpu/test_rope.py @@ -7,7 +7,7 @@ class FusedROPETester(TestCase): def setUp(self): - self.batch = 1 + self.batch = 2 self.seq_len = 32 self.max_seq_len = 384 self.head_size = 256 @@ -76,7 +76,10 @@ def hf_forward( query, key, position_ids, embed_positions, offset=None, rotary_dim=None ): embed_positions = _get_embed_positions(embed_positions, position_ids) - sincos = embed_positions.squeeze()[position_ids] + repeated_position_ids = position_ids.unsqueeze(-1).repeat( + 1, 1, embed_positions.shape[-1] + ) + sincos = torch.gather(embed_positions, 1, repeated_position_ids) sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) if rotary_dim < self.head_size: From 65d26b5e511a4b7de3a9b0ccbadb40b785ecc0b1 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 24 May 2024 15:28:36 +0800 Subject: [PATCH 19/21] Add readme for fast_bert example (#2795) (#2921) --- examples/cpu/features/fast_bert/README.md | 18 ++++++++++++++++++ .../cpu/tpp/fused_bert.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 examples/cpu/features/fast_bert/README.md diff --git a/examples/cpu/features/fast_bert/README.md b/examples/cpu/features/fast_bert/README.md new file mode 100644 index 000000000..51bbcdf19 --- /dev/null +++ b/examples/cpu/features/fast_bert/README.md @@ -0,0 +1,18 @@ +# Feature Description: + +`ipex.fast_bert` proposed a technique to speed up BERT workloads. Implementation leverages the idea from [Tensor Processing Primitives](https://arxiv.org/pdf/2104.05755.pdf). + +Currently `ipex.fast_bert` API is only well optimized for training. For inference, it ensures functionality, while to get peak perf, please use `ipex.optimize` API + torchscript. + +# Prerequisite: +Transformers 4.6.0 ~ 4.38.1 + +# Usage Example: +Training: +``` +python fast_bert_training_bf16.py +``` +Inference: +``` +python fast_bert_inference_bf16.py +``` diff --git a/intel_extension_for_pytorch/cpu/tpp/fused_bert.py b/intel_extension_for_pytorch/cpu/tpp/fused_bert.py index 03a513a08..1b0b293b7 100644 --- a/intel_extension_for_pytorch/cpu/tpp/fused_bert.py +++ b/intel_extension_for_pytorch/cpu/tpp/fused_bert.py @@ -1243,7 +1243,7 @@ def fast_bert(model, dtype=torch.float, optimizer=None, unpad=False): >>> model = ... >>> model.load_state_dict(torch.load(PATH)) >>> model.eval() - >>> optimized_model = ipex.tpp_bert(model, dtype=torch.bfloat16) + >>> optimized_model = ipex.fast_bert(model, dtype=torch.bfloat16) >>> # running evaluation step. >>> # bfloat16 training case. >>> optimizer = ... 
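The fast_bert README added above and the `fast_bert` docstring describe the same entry point; the following is a minimal sketch of wiring up both modes it mentions, assuming a stock Hugging Face checkpoint (`bert-base-uncased`) and an AdamW optimizer — neither is part of this patch, and the `(model, optimizer)` return for the training case is assumed to mirror the `ipex.optimize` convention:

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import BertModel  # transformers 4.6.0 ~ 4.38.1 per the README

# Inference: optimize the eval-mode model; bfloat16 matches the docstring example.
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()
opt_model = ipex.fast_bert(model, dtype=torch.bfloat16)

# Training: also hand the optimizer to fast_bert so it is prepared together with
# the model (signature per fused_bert.py: fast_bert(model, dtype, optimizer, unpad)).
train_model = BertModel.from_pretrained("bert-base-uncased")
train_model.train()
optimizer = torch.optim.AdamW(train_model.parameters(), lr=5e-5)
train_model, optimizer = ipex.fast_bert(
    train_model, dtype=torch.bfloat16, optimizer=optimizer
)
```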
From 27315216181eb76ea095e13d458efe645d8db445 Mon Sep 17 00:00:00 2001 From: blzheng Date: Mon, 27 May 2024 11:35:43 +0800 Subject: [PATCH 20/21] fix compatibility issues with transformers (#2931) --- .../transformers/models/reference/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index c8b78fc34..b4f2cf1e1 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -15,6 +15,7 @@ import transformers try: + from transformers.generation.configuration_utils import GenerationConfig from transformers.modeling_attn_mask_utils import ( _prepare_4d_causal_attention_mask, ) @@ -27,7 +28,6 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) - from transformers.generation.configuration_utils import GenerationConfig except ImportError: pass From 64fb34a6cae21880e7e45b35b71aaae8aa060bf3 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Mon, 27 May 2024 13:32:17 +0900 Subject: [PATCH 21/21] update doc footer (#2933) --- docs/_static/custom.css | 4 ++++ docs/_templates/footer.html | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/_static/custom.css b/docs/_static/custom.css index b26690143..010bffac7 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -17,6 +17,10 @@ a#wap_dns { display: none; } +a#wap_nac { + display: none; +} + /* replace the copyright to eliminate the copyright symbol enforced by the ReadTheDocs theme */ div[role=contentinfo] { diff --git a/docs/_templates/footer.html b/docs/_templates/footer.html index 94c8435e1..92839d2d0 100644 --- a/docs/_templates/footer.html +++ b/docs/_templates/footer.html @@ -1,3 +1,3 @@ {% extends '!footer.html' %} {% block extrafooter %} {{super}} -

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
+

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
{% endblock %}
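Taken together, the Whisper enablement earlier in this series plugs into the same `optimize` front end as the other model families listed in its docstring. Below is a minimal end-to-end sketch of exercising it, assuming the public `ipex.llm.optimize` wrapper and the `openai/whisper-large-v2` checkpoint that the cut-down test config mirrors — both are assumptions for illustration, not something these patches fix:

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import WhisperForConditionalGeneration

# Assumed checkpoint; tests/cpu/hf_configs/whisper/config.json is a 1-layer
# cut-down of this model (d_model=1280, 80 mel bins).
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.eval()

# Same call used for the other supported families; bfloat16 is an illustrative choice.
model = ipex.llm.optimize(model, dtype=torch.bfloat16)

# Whisper consumes a log-mel spectrogram of shape [batch, num_mel_bins, frames];
# zeros stand in for real audio features in this sketch.
input_features = torch.zeros(1, 80, 3000)
with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
    generated_ids = model.generate(input_features=input_features)
```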