From 9494bb8ea5affd69c55bd8ec90d23272b7c91604 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Fri, 5 Jan 2024 15:50:18 +0800
Subject: [PATCH] Change the calib dataset to pile-10k

Signed-off-by: Zhang, Weiwei1
---
 .../language-modeling/pruning/eager/README.md | 71 +++++++++++++------
 .../pruning/eager/run_clm_sparsegpt.py        | 22 ++++--
 .../eager/scripts/run_llm_sparsegpt.sh        |  6 +-
 3 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/README.md
index 9fae63388c0..019bcb9c365 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/README.md
@@ -44,9 +44,18 @@ Intel® Neural Compressor provides support for pruning and model slimming operat
 Through experimental verification, it has been observed that pruning the Multi-Layer Perceptron (MLP) layers using a channel-wise pattern can achieve a sparsity level of 10%-20%. This pruning technique speeds up inference while maintaining an accuracy drop of less than 1%. [Retrain-free Example](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_no_trainer.py).
 
-The pruning patterns of 1x1 and N:M are supported through the use of the [SparseGPT Example](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py), It is possible to prune models up to 70B in size within two hours, achieving a sparsity of 40%-50% in both the Multi-Head Attention (MHA) and MLP layers. For models of 7B and above, the drop in accuracy is less than 1%.
+The pruning patterns of 1x1 and N:M are supported through the use of the [SparseGPT Example](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py). It is possible to prune models up to 70B in size within two hours, achieving a sparsity of 40%-60% in both the Multi-Head Attention (MHA) and MLP layers. For models of 7B and above, the drop in accuracy is less than 1%.
+Note that pruning models of 30 billion parameters and above can be done on a single GPU card (such as an A100), while evaluation is recommended to be performed with multiple cards:
+```shell
+CUDA_VISIBLE_DEVICES=0,1 \
+python examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py \
+    --model_name_or_path /PATH/TO/SPARSE/LLM/ \
+    --device=0 \
+    --eval_dtype 'bf16' \
+    --per_device_eval_batch_size 2
+```
 
-Pruning scripts are available for LLM sparse models such as GPT-j, BLOOM, OPT, LLaMA, and the sparse model can be obtained by modifying the pruning parameters. [Pruning Scripts](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/).
+Pruning scripts are available for sparse LLMs such as GPT-J, BLOOM, OPT, LLaMA, Qwen, ChatGLM, MPT, and Falcon; the sparse model can be obtained by modifying the pruning parameters. [Pruning Scripts](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/).
@@ -71,27 +80,33 @@ The last word acc of the channel-wise sparse model is shown in the following tab
 | bigscience/bloom-7b1 | CLM | pile_10k | lambada_openai | BF16 | 0.5723 | 0.5756 | 0.58% |
-
 ## SparseGPT Results
 
 The last word acc of the 1x1 pattern sparse model using [the sparseGPT script](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/run_llm_sparsegpt.sh) is shown in the following table.
+
 | Model | Task | Calibration dataset | Evaluation dataset | Sparsity | Precision | Dense last word accuracy | Sparse last word accuracy | Relative drop |
 | :----: | :----: | :----: | :----: | :----: | :----: | :----: |:----: |:----:|
-| meta-llama/Llama-2-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 30% | FP32 | 0.7392 | 0.7320 | -0.97% |
-| meta-llama/Llama-2-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 30% | BF16 | 0.7365 | 0.7304 | -1.19% |
-| EleutherAI/gpt-j-6b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6831 | 0.6922 | +1.33% |
-| EleutherAI/gpt-j-6b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6771 | 0.6874 | +0.63% |
-| decapoda-research/llama-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7361 | 0.7332 | -0.39% |
-| decapoda-research/llama-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.7326 | 0.7297 | -0.87% |
-| facebook/opt-6.7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6769 | 0.6616 | -2.26% |
-| facebook/opt-6.7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6730 | 0.6577 | -2.84% |
-| tiiuae/falcon-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7467 | 0.7528 | +0.82% |
-| tiiuae/falcon-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.7464 | 0.7502 | +0.47% |
-| bigscience/bloom-7b1 | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.5764 | 0.5606 | -2.74% |
-| bigscience/bloom-7b1 | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.5725 | 0.5587 | -3.07% |
-| mosaicml/mpt-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7056 | 0.7035 | -0.30% |
-| mosaicml/mpt-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6831 | 0.6856 | -2.83% |
+| EleutherAI/gpt-j-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.6831 | 0.6922 | +2.30% |
+| EleutherAI/gpt-j-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6781 | 0.6874 | +1.48% |
+| meta-llama/Llama-2-7b-hf | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7392 | 0.7411 | +0.26% |
+| meta-llama/Llama-2-7b-hf | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7361 | 0.7376 | -0.22% |
+| huggyllama/llama-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7361 | 0.7450 | +1.21% |
+| huggyllama/llama-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7308 | 0.7427 | +0.90% |
+| facebook/opt-6.7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.6769 | 0.6897 | +1.89% |
+| facebook/opt-6.7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6765 | 0.6856 | +1.29% |
+| tiiuae/falcon-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7467 | 0.7555 | +1.18% |
+| tiiuae/falcon-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7467 | 0.7561 | +1.26% |
+| bigscience/bloom-7b1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.5764 | 0.5768 | +0.07% |
+| bigscience/bloom-7b1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.5731 | 0.5738 | -0.45% |
+| mosaicml/mpt-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7056 | 0.7114 | +0.82% |
+| mosaicml/mpt-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6831 | 0.6920 | -1.93% |
+| THUDM/chatglm3-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.5888 | 0.5822 | -1.12% |
+| THUDM/chatglm3-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.5878 | 0.5812 | -1.29% |
+| mistralai/Mistral-7B-v0.1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7590 | 0.7803 | +2.81% |
+| mistralai/Mistral-7B-v0.1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7561 | 0.7770 | +2.37% |
+| Qwen/Qwen-7B | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.6996 | 0.7085 | +1.27% |
+| Qwen/Qwen-7B | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6959 | 0.7077 | +1.16% |
 | mosaicml/mpt-7b-chat | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6550 | 0.6561 | +0.17% |
 | mosaicml/mpt-7b-chat | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6456 | 0.6451 | -1.51% |
 | meta-llama/Llama-2-13b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7679 | 0.7629 | -0.65% |
@@ -100,16 +115,30 @@
 | decapoda-research/llama-13b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 50% | BF16 | 0.7599 | 0.7559 | -0.89% |
 | meta-llama/Llama-2-70b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | FP32 | 0.7964 | 0.7951 | -0.16% |
 | meta-llama/Llama-2-70b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | BF16 | 0.7937 | 0.7943 | -0.26% |
-| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | FP32 | - | - | - |
-| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | BF16 | 0.7673 | 0.7813 | - |
-
+| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | FP32 | 0.7702 | 0.7859 | +2.04% |
+| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | BF16 | 0.7673 | 0.7813 | +1.44% |
+
+
 ## References
 
 [1] Kwon, W., Kim, S., Mahoney, M.W., Hassoun, J., Keutzer, K. and Gholami, A., 2022. A fast post-training pruning framework for transformers. Advances in Neural Information Processing Systems, 35, pp.24101-24116.
 
-[2] Frantar, E. and Alistarh, D., Sparsegpt: Massive language models can be accurately pruned in one-shot, 2023. URL https://arxiv.org/abs/2301.00774.
+[2] Frantar, E. and Alistarh, D., 2023, July. SparseGPT: Massive language models can be accurately pruned in one-shot. In International Conference on Machine Learning (pp. 10323-10337). PMLR.
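For orientation before the script changes below: the SparseGPT examples are driven by `WeightPruningConfig` plus `prepare_pruning`, the two symbols whose imports this patch reshuffles. The following is a minimal sketch of that flow using the calibration defaults the patch introduces (`NeelNanda/pile-10k`, 128 shuffled samples); the `pruning_type="sparse_gpt"` key, the toy model name, and the collation details are illustrative assumptions rather than code from the repository:

```python
# Minimal sketch of the flow run_clm_sparsegpt.py implements; names marked
# as assumptions in the comments are illustrative, not verbatim from the example.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.training import WeightPruningConfig, prepare_pruning

model_name = "facebook/opt-125m"  # hypothetical small model for a quick test
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Calibration set: this patch makes NeelNanda/pile-10k the default and
# draws --calib_size (default 128) shuffled samples from it.
calib = load_dataset("NeelNanda/pile-10k", split="train")
calib = calib.shuffle(seed=42).select(range(128))

def collate(batch):
    # Tokenize raw text into fixed-length calibration inputs (assumption:
    # the pruner consumes standard input_ids/attention_mask batches).
    return tokenizer([ex["text"] for ex in batch], truncation=True,
                     max_length=512, padding="max_length", return_tensors="pt")

train_dataloader = DataLoader(calib, batch_size=1, collate_fn=collate)

# One-shot SparseGPT pruning at the settings run_llm_sparsegpt.sh passes:
# 1x1 pattern, 50% target sparsity. pruning_type="sparse_gpt" is an
# assumption about the config key that selects the SparseGPT criterion.
configs = WeightPruningConfig(
    pruning_type="sparse_gpt",
    target_sparsity=0.5,
    pattern="1x1",
)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pruning = prepare_pruning(model, configs, dataloader=train_dataloader, device=device)
```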
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
index 970580dc63b..14f70ff3d9e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
@@ -49,9 +49,8 @@ def skip(*args, **kwargs):
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 from timers import CPUTimer, GPUTimer
-from neural_compressor.training import WeightPruningConfig
-from neural_compressor.compression.pruner import (prepare_pruning,
-                                                  parse_auto_slim_config)
+from neural_compressor.training import WeightPruningConfig, prepare_pruning
+from neural_compressor.compression.pruner import parse_auto_slim_config
 from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 
 check_min_version("4.27.0.dev0")
@@ -70,7 +69,7 @@ def parse_args():
     parser.add_argument(
         "--calibration_dataset_name",
         type=str,
-        default="wikitext-2-raw-v1",
+        default="NeelNanda/pile-10k",  # e.g. wikitext-2-raw-v1
         help="The name of the pruning dataset to use (via the datasets library).",
     )
     parser.add_argument(
@@ -129,6 +128,12 @@ def parse_args():
         default=16,
         help="Batch size (per device) for the evaluation dataloader.",
     )
+    parser.add_argument(
+        "--calib_size",
+        type=int,
+        default=128,
+        help="Sample size for the calibration dataset.",
+    )
     parser.add_argument(
         "--learning_rate",
         type=float,
@@ -403,8 +408,9 @@ def main():
             from_tf=bool(".ckpt" in args.model_name_or_path),
             config=config,
             trust_remote_code=args.trust_remote_code,
-            low_cpu_mem_usage=args.low_cpu_mem_usage,
+            low_cpu_mem_usage=args.low_cpu_mem_usage
         )
+
     else:
         logger.info("Training new model from scratch")
 
@@ -493,7 +499,7 @@ def group_texts(examples):
     train_dataset = lm_datasets["train"]
 
     # DataLoaders creation:
-    train_dataset = train_dataset.shuffle(seed=42).select(range(128))
+    train_dataset = train_dataset.shuffle(seed=42).select(range(args.calib_size))
     total_batch_size = args.per_device_train_batch_size
     if local_rank != -1:
         total_batch_size *= WORLD_SIZE
@@ -544,8 +550,10 @@ def group_texts(examples):
         torch.backends.cudnn.allow_tf32 = False
     use_cache = model.config.use_cache
     model.config.use_cache = False
-
+    import time
+    s = time.time()
     pruning = prepare_pruning(model, configs, dataloader=train_dataloader, device=device)
+    logger.info(f"Pruning time: {time.time() - s:.2f}s")
     model.config.use_cache = use_cache
 
     if args.output_dir is not None:
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/run_llm_sparsegpt.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/run_llm_sparsegpt.sh
index 915aef802b6..92b64264b4c 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/run_llm_sparsegpt.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/run_llm_sparsegpt.sh
@@ -11,13 +11,11 @@ export CUBLAS_WORKSPACE_CONFIG=':4096:8'
 
 #cd neural-compressor
 python examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py \
     --model_name_or_path /PATH/TO/LLM/ \
-    --calibration_dataset_name wikitext-2-raw-v1 \
-    --evaluation_dataset_name lambada \
     --do_prune \
     --device=0 \
     --output_dir=/PATH/TO/SAVE/ \
+    --eval_dtype 'bf16' \
+    --per_device_eval_batch_size 16 \
     --target_sparsity 0.5 \
     --pruning_pattern 1x1
-
-
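Once the script finishes, the pruned model saved to `--output_dir` can be sanity-checked before the multi-card evaluation described in the README. A minimal sketch, assuming the checkpoint stores pruned weights as literal zeros in dense tensors and using `/PATH/TO/SAVE/` as a placeholder:

```python
# Verify that linear-layer weights reach roughly --target_sparsity.
# /PATH/TO/SAVE/ stands in for the --output_dir used when pruning.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("/PATH/TO/SAVE/", torch_dtype=torch.bfloat16)

total = zeros = 0
for name, param in model.named_parameters():
    if param.dim() == 2 and "embed" not in name:  # weight matrices, skip embeddings
        total += param.numel()
        zeros += (param == 0).sum().item()
print(f"Linear-weight sparsity: {zeros / total:.2%}")  # expect ~50% for --target_sparsity 0.5
```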