Merge pull request #19 from vllm-project/smoothquant-validation

[Bug] Fix validation errors for smoothquant modifier + update examples
vllm-project · Jul 8, 2024 · ac118ca
2 parents 94058dc + b7405af
commit ac118ca
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 34 deletions.
diff --git a/examples/quantization_w8a8_int8/README.md b/examples/quantization_w8a8_int8/README.md
@@ -90,10 +90,14 @@ We first select the quantization algorithm. For W8A8, we want to:
 
 ```python
 from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
-
-# Configure the quantization algorithm to run. This more complex scheme requires a YAML based recipe.
-recipe = "./recipe.yaml"
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+
+# Configure the quantization algorithms to run.
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+]
 
 # Apply quantization.
 oneshot(

diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py
@@ -1,6 +1,8 @@
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 
 # Select model and load it.
@@ -55,9 +57,11 @@ def tokenize(sample):
 #   * apply SmoothQuant to make the activations easier to quantize
 #   * quantize the weights to int8 with GPTQ (static per channel)
 #   * quantize the activations to int8 (dynamic per token)
-# Note: this scheme currently requires a more complex yaml recipe
 # Note: set sequential_update: true in the recipe to reduce memory
-recipe = "./recipe.yaml"
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+]
 
 # Apply algorithms.
 oneshot(

diff --git a/examples/quantization_w8a8_int8/recipe.yaml b/examples/quantization_w8a8_int8/recipe.yaml
diff --git a/src/llmcompressor/modifiers/smoothquant/base.py b/src/llmcompressor/modifiers/smoothquant/base.py
@@ -98,7 +98,7 @@ class SmoothQuantModifier(Modifier):
     num_calibration_steps: Optional[int] = None
     calibration_function: Optional[Callable] = None
 
-    hooks_: List = None
+    hooks_: Optional[List] = None
     resolved_mappings_: Optional[List] = None
     scales_: Optional[Dict] = None
 

diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
@@ -38,7 +38,7 @@ def save_pretrained_compressed(save_pretrained_method):
         def save_pretrained_wrapper(
             save_directory: str,
             sparsity_config: Optional[SparsityCompressionConfig] = None,
-            quantization_format: str = None,
+            quantization_format: Optional[str] = None,
             save_compressed: bool = False,
             skip_compression_stats: bool = False,
             **kwargs,