Commit e381ef9

Tests

rahul-tuli committed Dec 23, 2024
1 parent 480247c commit e381ef9

Showing 2 changed files with 21 additions and 36 deletions.
@@ -3,27 +3,35 @@ pruning_stage:
   SparseGPTModifier:
     sparsity: 0.5
     sequential_update: true
-    mask_structure: "2:4"
+    mask_structure: "0:0"
     targets: ['re:model.layers.\d*$']
-quant_stage:
+test_stage:
   quant_modifiers:
     QuantizationModifier:
       ignore: ["lm_head"]
       config_groups:
         group_0:
           weights:
             num_bits: 8
-            type: int
-            strategy: tensor
-            dynamic: false
+            type: "int"
+            symmetric: true
+            strategy: "tensor"
           input_activations:
             num_bits: 8
-            type: int
-            strategy: tensor
-            dynamic: true
-            symmetric: true
+            type: "int"
+            symmetric: false
+            strategy: "tensor"
           output_activations: null
           targets: ["Linear"]
+        group_1:
+          weights:
+            num_bits: 8
+            type: "int"
+            symmetric: true
+            strategy: "tensor"
+          input_activations: null
+          output_activations: null
+          targets: ["Embedding"]
   pruning_modifiers:
     ConstantPruningModifier:
       targets: [
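For context on the recipe above: mask_structure "2:4" asks SparseGPT for semi-structured sparsity (two of every contiguous group of four weights zeroed), while "0:0" lifts the structural constraint and leaves only the overall 0.5 sparsity target. In the new quantization config, group_0 quantizes Linear weights with symmetric per-tensor int8 and their input activations with asymmetric per-tensor int8; group_1 adds symmetric int8 for Embedding weights. A minimal sketch of that quantization arithmetic follows; the helper names are illustrative, not llm-compressor APIs:

import torch

def quantize_symmetric_int8(w: torch.Tensor):
    # strategy "tensor": one scale for the whole tensor; symmetric: zero point fixed at 0
    scale = w.abs().max().clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return q, scale

def quantize_asymmetric_int8(x: torch.Tensor):
    # symmetric: false -> a zero point shifts the integer grid onto [x.min(), x.max()]
    qmin, qmax = -128, 127
    scale = (x.max() - x.min()).clamp(min=1e-8) / (qmax - qmin)
    zero_point = torch.round(qmin - x.min() / scale).clamp(qmin, qmax)
    q = torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax).to(torch.int8)
    return q, scale, zero_point

w = torch.randn(64, 64)
q, scale = quantize_symmetric_int8(w)
w_hat = q.float() * scale                             # dequantize
assert (w - w_hat).abs().max() <= scale / 2 + 1e-6    # per-element error bounded by scale/2

The bounded-but-nonzero reconstruction error in this sketch is why the test below compares compressed weights against a 0.01 tolerance rather than asserting exact equality.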
@@ -370,9 +370,9 @@ def test_model_shared_tensors_gpu(
     "model_stub, recipe, sparse_format, quant_format",
     [
         (
-            "Xenova/llama2.c-stories110M",
-            "tests/llmcompressor/transformers/compression/recipes/sparse_24_int8.yaml",
-            CompressionFormat.sparse_24.value,
+            "Xenova/llama2.c-stories15M",
+            "tests/llmcompressor/transformers/compression/recipes/sparse_int8.yaml",
+            CompressionFormat.sparse_bitmask.value,
             CompressionFormat.int_quantized.value,
         ),
     ],
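The parametrization now stacks the generic bitmask sparse format with int8 weight quantization on the smaller stories15M model, instead of the 2:4 semi-structured format on stories110M. Roughly, a bitmask format stores only the nonzero values plus one bit per element marking their positions. The sketch below illustrates the idea only; the actual on-disk layout is defined by the compressed-tensors library and may differ:

import numpy as np
import torch

def bitmask_compress(t: torch.Tensor):
    mask = t != 0                                   # one bit per element marks the nonzeros
    values = t[mask]                                # only the nonzero values are stored
    packed = np.packbits(mask.flatten().numpy())    # 8 mask bits per byte
    return values, packed, t.shape

def bitmask_decompress(values, packed, shape):
    bits = np.unpackbits(packed)[: shape.numel()]
    mask = torch.from_numpy(bits).bool().reshape(shape)
    out = torch.zeros(shape, dtype=values.dtype)
    out[mask] = values
    return out

t = torch.randn(8, 8) * (torch.rand(8, 8) > 0.5)    # ~50% sparsity, matching the recipe
values, packed, shape = bitmask_compress(t)
assert torch.equal(bitmask_decompress(values, packed, shape), t)  # sparse round trip is lossless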
@@ -445,30 +445,7 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm
         if key.endswith("weight") and quant_format != "dense":
             # we don't expect an exact match for compressed
             diff = torch.abs(dense_tensor - reconstructed_tensor)
-            assert not torch.any(
-                diff > 0.01
-            ).item(), f"{key} has a diff greater than 0.01"
+            assert not torch.any(diff > 0.01), f"Max diff: {torch.max(diff)}"
         else:
             assert torch.equal(dense_tensor, reconstructed_tensor)
     shutil.rmtree(tmp_path)
-
-
-# This parameterization should be added to the test_compressor_stacking test
-# once the lossy nature of FP8 compress-decompress is resolved.
-# Until then, this test is marked as xfail.
-@pytest.mark.xfail(reason="Known issue with FP8 compress-decompress")
-@pytest.mark.parametrize(
-    "model_stub, recipe, sparse_format, quant_format",
-    [
-        (
-            "Xenova/llama2.c-stories110M",
-            "tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml",
-            CompressionFormat.sparse_24.value,
-            CompressionFormat.float_quantized.value,
-        ),
-    ],
-)
-def test_compressor_stacking_fp8(
-    model_stub, recipe, sparse_format, quant_format, tmp_path
-):
-    test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tmp_path)

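The removed xfail test recorded a known limitation: stacking with FP8 (float_quantized) did not pass because an FP8 compress-decompress round trip is lossy. A minimal illustration of that loss, assuming a PyTorch build with float8 support (torch.float8_e4m3fn, available in recent releases):

import torch

w = torch.randn(4, 4)
w_fp8 = w.to(torch.float8_e4m3fn)    # "compress" to an 8-bit float (3-bit mantissa)
w_back = w_fp8.to(torch.float32)     # "decompress" back to fp32
print((w - w_back).abs().max())      # nonzero: most fp32 values are not representable in e4m3
assert not torch.equal(w, w_back)    # the round trip is not exact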