@@ -44,9 +44,18 @@ Intel® Neural Compressor provides support for pruning and model slimming operat

Through experimental verification, it has been observed that pruning the Multi-Layer Perceptron (MLP) layers using a channel-wise pattern can achieve a sparsity level of 10%-20%. This pruning technique speeds up inference while maintaining an accuracy drop of less than 1%. [Retrain-free Example](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_no_trainer.py).
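
A minimal programmatic sketch of this retrain-free, channel-wise setup is shown below. It assumes the `WeightPruningConfig`/`prepare_pruning` entry points used elsewhere in this PR also accept a channel-wise pattern and a retrain-free criterion; the option names (`channelx1`, `retrain_free`, the `op_names` regexes) and the stand-in model `facebook/opt-125m` are illustrative assumptions, not the exact settings of the linked example.

```python
# Hedged sketch: channel-wise, retrain-free pruning of the MLP projections.
# Config keys ("op_names", "pattern", "pruning_type", "target_sparsity") follow the
# Neural Compressor pruning docs; verify them against your installed version.
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from neural_compressor.training import WeightPruningConfig, prepare_pruning

model_name = "facebook/opt-125m"  # small stand-in model, for illustration only
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tiny calibration set; real runs use a few hundred samples (see --calib_size in the script).
texts = load_dataset("NeelNanda/pile-10k", split="train[:32]")["text"]
samples = []
for t in texts:
    enc = tokenizer(t, truncation=True, padding="max_length", max_length=128)
    enc["labels"] = enc["input_ids"].copy()  # causal-LM labels, as in the example scripts
    samples.append(enc)
calib_loader = DataLoader(samples, batch_size=4, collate_fn=default_data_collator)

configs = WeightPruningConfig(
    [{
        "op_names": [".*fc1.*", ".*fc2.*"],  # assumed regexes targeting the MLP layers
        "pattern": "channelx1",              # channel-wise pruning pattern
        "pruning_type": "retrain_free",
        "target_sparsity": 0.2,              # 10%-20% keeps the accuracy drop under 1%
    }]
)
pruning = prepare_pruning(model, configs, dataloader=calib_loader, device="cpu")
```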

The 1x1 and N:M pruning patterns are supported through the [SparseGPT Example](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py). It is possible to prune models of up to 70B parameters within two hours, achieving 40%-50% sparsity in both the Multi-Head Attention (MHA) and MLP layers. For models of 7B parameters and above, the accuracy drop is less than 1%.
The 1x1 and N:M pruning patterns are supported through the [SparseGPT Example](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py). It is possible to prune models of up to 70B parameters within two hours, achieving 40%-60% sparsity in both the Multi-Head Attention (MHA) and MLP layers. For models of 7B parameters and above, the accuracy drop is less than 1%.
Note that pruning models with 30 billion parameters or more can be done on a single GPU card (such as an A100), while evaluation is recommended to be performed with multiple cards:
```shell
CUDA_VISIBLE_DEVICES=0,1 \
python examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py \
--model_name_or_path /PATH/TO/SPARSE/LLM/ \
--device=0 \
--eval_dtype 'bf16' \
--per_device_eval_batch_size 2
```
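
The same SparseGPT flow can also be driven from Python rather than the command line. The sketch below is built around the `prepare_pruning(model, configs, dataloader=..., device=...)` call used in `run_clm_sparsegpt.py`; the `sparse_gpt` pruning type name, the small stand-in model, and the save path are assumptions for illustration.

```python
# Hedged sketch: one-shot SparseGPT-style pruning with a 1x1 pattern at 50% sparsity.
# The prepare_pruning entry point mirrors run_clm_sparsegpt.py; "sparse_gpt" is an
# assumed pruning_type name -- check it against your Neural Compressor version.
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from neural_compressor.training import WeightPruningConfig, prepare_pruning

model_name = "facebook/opt-125m"  # stand-in; the results below use 6B-72B models
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

texts = load_dataset("NeelNanda/pile-10k", split="train[:128]")["text"]  # mirrors --calib_size 128
samples = [tokenizer(t, truncation=True, padding="max_length", max_length=512) for t in texts]
calib_loader = DataLoader(samples, batch_size=2, collate_fn=default_data_collator)

configs = WeightPruningConfig(
    [{
        "pattern": "1x1",              # or e.g. "2:4" for an N:M pattern
        "pruning_type": "sparse_gpt",  # assumed type name for the SparseGPT criterion
        "target_sparsity": 0.5,
    }]
)
model.config.use_cache = False  # disable the KV cache during pruning, as the script does
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pruning = prepare_pruning(model, configs, dataloader=calib_loader, device=device)
model.save_pretrained("./sparse-model")  # sparse checkpoint for later BF16 evaluation
```

After pruning, the saved sparse checkpoint can be evaluated with the multi-card command shown above.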

Pruning scripts are available for sparse LLMs such as GPT-J, BLOOM, OPT, and LLaMA; a sparse model can be obtained by modifying the pruning parameters. [Pruning Scripts](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/).
Pruning scripts are available for sparse LLMs such as GPT-J, BLOOM, OPT, LLaMA, Qwen, ChatGLM, MPT, and Falcon; a sparse model can be obtained by modifying the pruning parameters. [Pruning Scripts](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/).

<br />

@@ -71,27 +80,33 @@ The last word acc of the channel-wise sparse model is shown in the following tab
| bigscience/bloom-7b1 | CLM | pile_10k | lambada_openai | BF16 | 0.5723 | 0.5756 | 0.58% |



## SparseGPT Results

The last-word accuracy of the 1x1-pattern sparse models produced with [the SparseGPT script](https://github.com/intel/neural-compressor/tree/master/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/scripts/run_llm_sparsegpt.sh) is shown in the following table.


| Model | Task | Calibration dataset | Evaluation dataset | Sparsity | Precision | Dense last word accuracy | Sparse last word accuracy | Relative drop |
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |:----: |:----:|
| meta-llama/Llama-2-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 30% | FP32 | 0.7392 | 0.7320 | -0.97% |
| meta-llama/Llama-2-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 30% | BF16 | 0.7365 | 0.7304 | -1.19% |
| EleutherAI/gpt-j-6b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6831 | 0.6922 | +1.33% |
| EleutherAI/gpt-j-6b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6771 | 0.6874 | +0.63% |
| decapoda-research/llama-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7361 | 0.7332 | -0.39% |
| decapoda-research/llama-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.7326 | 0.7297 | -0.87% |
| facebook/opt-6.7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6769 | 0.6616 | -2.26% |
| facebook/opt-6.7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6730 | 0.6577 | -2.84% |
| tiiuae/falcon-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7467 | 0.7528 | +0.82% |
| tiiuae/falcon-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.7464 | 0.7502 | +0.47% |
| bigscience/bloom-7b1 | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.5764 | 0.5606 | -2.74% |
| bigscience/bloom-7b1 | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.5725 | 0.5587 | -3.07% |
| mosaicml/mpt-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7056 | 0.7035 | -0.30% |
| mosaicml/mpt-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6831 | 0.6856 | -2.83% |
| EleutherAI/gpt-j-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.6831 | 0.6922 | +2.30% |
| EleutherAI/gpt-j-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6781 | 0.6874 | +1.48% |
| meta-llama/Llama-2-7b-hf | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7392 | 0.7411 | +0.26% |
| meta-llama/Llama-2-7b-hf | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7361 | 0.7376 | -0.22% |
| huggyllama/llama-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7361 | 0.7450 | +1.21% |
| huggyllama/llama-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7308 | 0.7427 | +0.90% |
| facebook/opt-6.7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.6769 | 0.6897 | +1.89% |
| facebook/opt-6.7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6765 | 0.6856 | +1.29% |
| tiiuae/falcon-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7467 | 0.7555 | +1.18% |
| tiiuae/falcon-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7467 | 0.7561 | +1.26% |
| bigscience/bloom-7b1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.5764 | 0.5768 | +0.07% |
| bigscience/bloom-7b1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.5731 | 0.5738 | -0.45% |
| mosaicml/mpt-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7056 | 0.7114 | +0.82% |
| mosaicml/mpt-7b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6831 | 0.6920 | -1.93% |
| THUDM/chatglm3-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.5888 | 0.5822 | -1.12% |
| THUDM/chatglm3-6b | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.5878 | 0.5812 | -1.29% |
| mistralai/Mistral-7B-v0.1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.7590 | 0.7803 | +2.81% |
| mistralai/Mistral-7B-v0.1 | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.7561 | 0.7770 | +2.37% |
| Qwen/Qwen-7B | CLM | NeelNanda/pile-10k | lambada_openai | 40% | FP32 | 0.6996 | 0.7085 | +1.27% |
| Qwen/Qwen-7B | CLM | NeelNanda/pile-10k | lambada_openai | 40% | BF16 | 0.6959 | 0.7077 | +1.16% |
| mosaicml/mpt-7b-chat | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6550 | 0.6561 | +0.17% |
| mosaicml/mpt-7b-chat | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6456 | 0.6451 | -1.51% |
| meta-llama/Llama-2-13b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7679 | 0.7629 | -0.65% |
@@ -100,16 +115,30 @@ The last word acc of the 1x1 pattern sparse model using [the sparseGPT script](h
| decapoda-research/llama-13b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 50% | BF16 | 0.7599 | 0.7559 | -0.89% |
| meta-llama/Llama-2-70b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | FP32 | 0.7964 | 0.7951 | -0.16% |
| meta-llama/Llama-2-70b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | BF16 | 0.7937 | 0.7943 | -0.26% |
| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | FP32 | - | - | - |
| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | BF16 | 0.7673 | 0.7813 | - |

| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | FP32 | 0.7702 | 0.7859 | +2.04% |
| Qwen/Qwen-72B | CLM | wikitext-2-raw-v1 | lambada_openai | 60% | BF16 | 0.7673 | 0.7813 | +1.44% |

<!-- discarded data -->
<!-- | meta-llama/Llama-2-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 30% | FP32 | 0.7392 | 0.7320 | -0.97% |
| meta-llama/Llama-2-7b-hf | CLM | wikitext-2-raw-v1 | lambada_openai | 30% | BF16 | 0.7361 | 0.7304 | -1.19% |
| EleutherAI/gpt-j-6b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6831 | 0.6922 | +1.33% |
| EleutherAI/gpt-j-6b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6781 | 0.6874 | +0.63% |
| huggyllama/llama-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7361 | 0.7332 | -0.39% |
| huggyllama/llama-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.7308 | 0.7297 | -0.87% |
| facebook/opt-6.7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.6769 | 0.6616 | -2.26% |
| facebook/opt-6.7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6765 | 0.6577 | -2.84% |
| tiiuae/falcon-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7467 | 0.7528 | +0.82% |
| tiiuae/falcon-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.7467 | 0.7502 | +0.47% |
| bigscience/bloom-7b1 | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.5764 | 0.5606 | -2.74% |
| bigscience/bloom-7b1 | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.5731 | 0.5587 | -3.07% |
| mosaicml/mpt-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | FP32 | 0.7056 | 0.7035 | -0.30% |
| mosaicml/mpt-7b | CLM | wikitext-2-raw-v1 | lambada_openai | 40% | BF16 | 0.6831 | 0.6839 | -2.83% | -->

## References

[1] Kwon, W., Kim, S., Mahoney, M.W., Hassoun, J., Keutzer, K. and Gholami, A., 2022. A fast post-training pruning framework for transformers. Advances in Neural Information Processing Systems, 35, pp.24101-24116.

[2] Frantar, E. and Alistarh, D., Sparsegpt: Massive language models can be accurately pruned in one-shot, 2023. URL https://arxiv.org/abs/2301.00774.
[2] Frantar, E. and Alistarh, D., 2023, July. Sparsegpt: Massive language models can be accurately pruned in one-shot. In International Conference on Machine Learning (pp. 10323-10337). PMLR.



@@ -49,9 +49,8 @@ def skip(*args, **kwargs):
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from timers import CPUTimer, GPUTimer
from neural_compressor.training import WeightPruningConfig
from neural_compressor.compression.pruner import (prepare_pruning,
parse_auto_slim_config)
from neural_compressor.training import WeightPruningConfig, prepare_pruning
from neural_compressor.compression.pruner import (parse_auto_slim_config)
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate

check_min_version("4.27.0.dev0")
@@ -70,7 +69,7 @@ def parse_args():
parser.add_argument(
"--calibration_dataset_name",
type=str,
default="wikitext-2-raw-v1",
default="NeelNanda/pile-10k", # e.g. wikitext-2-raw-v1
help="The name of the pruning dataset to use (via the datasets library).",
)
parser.add_argument(
@@ -129,6 +128,12 @@ def parse_args():
default=16,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--calib_size",
type=int,
default=128,
help="sample size for the calibration dataset.",
)
parser.add_argument(
"--learning_rate",
type=float,
@@ -403,8 +408,9 @@ def main():
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
trust_remote_code=args.trust_remote_code,
low_cpu_mem_usage=args.low_cpu_mem_usage,
low_cpu_mem_usage=args.low_cpu_mem_usage
)


else:
logger.info("Training new model from scratch")
@@ -493,7 +499,7 @@ def group_texts(examples):
train_dataset = lm_datasets["train"]

# DataLoaders creation:
train_dataset = train_dataset.shuffle(seed=42).select(range(128))
train_dataset = train_dataset.shuffle(seed=42).select(range(args.calib_size))
total_batch_size = args.per_device_train_batch_size
if local_rank != -1:
total_batch_size *= WORLD_SIZE
@@ -544,8 +550,10 @@ def group_texts(examples):
torch.backends.cudnn.allow_tf32 = False
use_cache = model.config.use_cache
model.config.use_cache = False

import time
s = time.time()
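# One-shot pruning runs inside prepare_pruning, driven by the calibration dataloader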
pruning = prepare_pruning(model, configs, dataloader=train_dataloader, device=device)
logger.info(f"cost time: {time.time() - s}")
model.config.use_cache = use_cache

if args.output_dir is not None:
@@ -11,13 +11,11 @@ export CUBLAS_WORKSPACE_CONFIG=':4096:8'
#cd neural-compressor
python examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py \
--model_name_or_path /PATH/TO/LLM/ \
--calibration_dataset_name wikitext-2-raw-v1 \
--evaluation_dataset_name lambada \
--do_prune \
--device=0 \
--output_dir=/PATH/TO/SAVE/ \
--eval_dtype 'bf16' \
--per_device_eval_batch_size 16 \
--target_sparsity 0.5 \
--pruning_pattern 1x1