Merge remote-tracking branch 'origin' into kylesayrs/move-sparsegptq
kylesayrs committed Dec 19, 2024
2 parents 8513782 + 7366a2d commit 1ad71db
Showing 130 changed files with 1,150 additions and 1,287 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -57,6 +57,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API.
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from transformers import AutoModelForCausalLM

# Select quantization algorithm. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
16 changes: 0 additions & 16 deletions examples/automodelforcausallm/README.md
@@ -11,19 +11,3 @@ MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
```

These models are still supported through the `SparseAutoModelForCausalLM` pathway:

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM

MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
model = SparseAutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
)
```

Models quantized through `llm-compressor` using `compressed-tensors<=v0.6.0` are not
supported through `AutoModelForCausalLM` and will still need the
`SparseAutoModelForCausalLM` pathway to run.
8 changes: 4 additions & 4 deletions examples/big_models_with_accelerate/README.md
@@ -14,13 +14,13 @@
To enable `accelerate` features with `llmcompressor`, simply insert `device_map` in `from_pretrained` during model load.

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# device_map="auto" triggers usage of accelerate
# if > 1 GPU, the model will be sharded across the GPUs
# if not enough GPU memory to fit the model, parameters are offloaded to the CPU
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto")
```

@@ -34,12 +34,12 @@ potentially going out-of-memory.

```python
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
from llmcompressor.transformers import SparseAutoModelForCausalLM,
from transformers import AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed)
device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1)
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map=device_map,
torch_dtype="auto",
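
Both snippets above rely on `accelerate` to place weights across GPUs and CPU. A minimal sketch of how to verify the resulting placement after load, assuming the same `MODEL_ID` and enough combined GPU/CPU memory for the checkpoint:

```python
from transformers import AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)

# hf_device_map is populated by accelerate whenever a device_map is used;
# it maps each top-level module to the device ("cuda:0", "cpu", "disk", ...) it landed on.
print(model.hf_device_map)
```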
6 changes: 3 additions & 3 deletions examples/big_models_with_accelerate/cpu_offloading_fp8.py
@@ -1,14 +1,14 @@
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

# Load model
# Note: device_map="auto" will offload to CPU if not enough space on GPU.
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
)

10 changes: 5 additions & 5 deletions examples/big_models_with_accelerate/mult_gpus_int8_device_map.py
@@ -1,22 +1,22 @@
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# adjust based off number of desired GPUs
# reserve_for_hessians=True reserves memory which is required by
# GPTQModifier and SparseGPTModifier
device_map = calculate_offload_device_map(
MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16
MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
)

model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
6 changes: 3 additions & 3 deletions examples/big_models_with_accelerate/multi_gpu_int8.py
@@ -1,14 +1,14 @@
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic"

# 1) Load model (device_map="auto" will shard the model over multiple GPUs!).
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
17 changes: 6 additions & 11 deletions examples/compressed_inference/fp8_compressed_inference.py
@@ -1,13 +1,7 @@
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

"""
This example covers how to load a quantized model in compressed mode. By default,
SparseAutoModelForCausalLM will decompress the whole model on load resulting in no
memory savings from quantization. By setting the `run_compressed` kwarg to True, the
model will remain compressed in memory on load, saving memory during inference at the
cost of increased runtime
This example covers how to load a quantized model using AutoModelForCausalLM.
During inference, each layer will be decompressed as needed before the forward pass.
This saves memory as only a single layer is ever uncompressed at a time, but increases
@@ -25,9 +19,10 @@
"def fibonacci(n):",
]

# set run_compressed=True to enable running in compressed mode
compressed_model = SparseAutoModelForCausalLM.from_pretrained(
MODEL_STUB, torch_dtype="auto", device_map="cuda:0", run_compressed=True
compressed_model = AutoModelForCausalLM.from_pretrained(
MODEL_STUB,
torch_dtype="auto",
device_map="cuda:0",
)

# tokenize the sample data
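
A minimal generation sketch against a compressed checkpoint, assuming the stub below (hypothetical, standing in for the example's `MODEL_STUB`) resolves to a compressed-tensors FP8 model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# hypothetical stub; substitute the MODEL_STUB used in the example
MODEL_STUB = "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Compressed"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_STUB, torch_dtype="auto", device_map="cuda:0"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)

# tokenize a prompt and generate; layers are decompressed as needed during the forward pass
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))
```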
9 changes: 5 additions & 4 deletions examples/quantization_2of4_sparse_w4a16/README.md
@@ -46,10 +46,10 @@ and quantize to 4 bits in one shot using GPTQ.

```python
import torch
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM

model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)

@@ -86,6 +86,7 @@ apply(
lr_scheduler_type="cosine",
warmup_ratio=0.1,
)

```


@@ -96,10 +97,10 @@ run the following:

```python
import torch
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM

compressed_output_dir = "output_llama7b_2of4_w4a16_channel_compressed"
model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
model.save_pretrained(compressed_output_dir, save_compressed=True)
```

10 changes: 8 additions & 2 deletions examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
@@ -1,13 +1,15 @@
import torch
from loguru import logger
from transformers import AutoModelForCausalLM

from llmcompressor.transformers import SparseAutoModelForCausalLM, apply
from llmcompressor.transformers import apply

# define a recipe to handle sparsity, finetuning and quantization
recipe = "2of4_w4a16_recipe.yaml"

# load the model in as bfloat16 to save on memory and compute
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)

@@ -51,3 +53,7 @@
lr_scheduler_type=lr_scheduler_type,
warmup_ratio=warmup_ratio,
)
logger.info(
    "Note: vLLM requires the dtype=torch.float16 when running the "
    "compressed marlin-24 model"
)
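
A minimal sketch of the vLLM usage the logged note refers to, assuming vLLM is installed; the checkpoint path below is hypothetical and should point at the directory the script above saved to:

```python
from vllm import LLM, SamplingParams

# hypothetical path; use the output_dir produced by llama7b_sparse_w4a16.py
llm = LLM(model="./output_llama7b_2of4_w4a16_channel", dtype="float16")

params = SamplingParams(max_tokens=64)
outputs = llm.generate(["def fibonacci(n):"], params)
print(outputs[0].outputs[0].text)
```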
7 changes: 3 additions & 4 deletions examples/quantization_kv_cache/README.md
@@ -33,14 +33,13 @@ Let's walk through the main steps of the quantization process:

### 1. Load Model

Load the model using `SparseAutoModelForCausalLM`:
Load the model using `AutoModelForCausalLM`:

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
6 changes: 6 additions & 0 deletions examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -1,4 +1,5 @@
from datasets import load_dataset
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
@@ -81,6 +82,11 @@ def process_and_tokenize(example):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

logger.info(
    "Running sample generation. "
    "Note: Inference with the quantized kv_cache is not supported. "
    "Please use vLLM for inference with the quantized kv_cache."
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
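
A minimal sketch of the suggested vLLM inference path for the FP8 kv_cache model, assuming vLLM is installed; the checkpoint path below is hypothetical and should point at the directory the example saved to:

```python
from vllm import LLM, SamplingParams

# hypothetical path; point this at the saved FP8 kv_cache checkpoint
llm = LLM(model="./Meta-Llama-3-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")

outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```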
7 changes: 3 additions & 4 deletions examples/quantization_w4a16/README.md
@@ -34,14 +34,13 @@ Now, we will step through the code in the example. There are four steps:

### 1) Load Model

Load the model using `SparseAutoModelForCausalLM`, which is a wrapper around `AutoModel` for handling quantized saving and loading. Note that `SparseAutoModel` is compatible with `accelerate` so you can load your model onto multiple GPUs if needed.
Load the model using `AutoModelForCausalLM`, which handles quantized saving and loading.

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
6 changes: 3 additions & 3 deletions examples/quantization_w4a16/llama3_example.py
@@ -1,13 +1,13 @@
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
7 changes: 3 additions & 4 deletions examples/quantization_w8a8_fp8/README.md
@@ -31,15 +31,14 @@ Now, we will step through the code in the example. There are three steps:

### 1) Load Model

Load the model using `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM` for saving and loading quantized models.
Load the model using `AutoModelForCausalLM`:

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
13 changes: 9 additions & 4 deletions examples/quantization_w8a8_fp8/gemma2_example.py
@@ -1,12 +1,12 @@
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "google/gemma-2-27b-it"

# 1) Load model.
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -21,7 +21,12 @@

# 3) Apply quantization and save in compressed-tensors format.
OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR, tokenizer=tokenizer)
oneshot(
model=model,
recipe=recipe,
tokenizer=tokenizer,
output_dir=OUTPUT_DIR,
)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
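
A minimal sketch of re-loading the saved checkpoint through the plain `transformers` pathway, assuming the `OUTPUT_DIR` naming derived above and enough memory for the model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# matches MODEL_ID.split("/")[1] + "-FP8-Dynamic" from the example
OUTPUT_DIR = "gemma-2-27b-it-FP8-Dynamic"

model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```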
13 changes: 9 additions & 4 deletions examples/quantization_w8a8_fp8/llama3.2_vision_example.py
@@ -1,13 +1,14 @@
from transformers import AutoProcessor, MllamaForConditionalGeneration

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load model.
model_class = wrap_hf_model_class(MllamaForConditionalGeneration)
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
model = MllamaForConditionalGeneration.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
@@ -22,7 +23,11 @@

# Apply quantization and save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
oneshot(
model=model,
recipe=recipe,
output_dir=SAVE_DIR,
)
processor.save_pretrained(SAVE_DIR)

# Confirm generations of the quantized model look sane.
6 changes: 3 additions & 3 deletions examples/quantization_w8a8_fp8/llama3_example.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
7 changes: 4 additions & 3 deletions examples/quantization_w8a8_fp8/llava1.5_example.py
@@ -1,13 +1,14 @@
from transformers import AutoProcessor, LlavaForConditionalGeneration

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class
from llmcompressor.transformers import oneshot

MODEL_ID = "llava-hf/llava-1.5-7b-hf"

# Load model.
model_class = wrap_hf_model_class(LlavaForConditionalGeneration)
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
model = LlavaForConditionalGeneration.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.