Commit

Merge branch 'main' of github.com:vllm-project/llm-compressor
horheynm committed Dec 19, 2024
2 parents 3fdc10a + 7366a2d commit e38c749
Showing 9 changed files with 72 additions and 9 deletions.
17 changes: 17 additions & 0 deletions src/llmcompressor/modifiers/smoothquant/base.py
@@ -2,6 +2,7 @@
 from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
+from compressed_tensors.utils.offload import is_module_offloaded
 from loguru import logger
 from torch.nn import Module
 
@@ -282,6 +283,10 @@ def _apply_smoothing(self, model: Module):
 
             @torch.no_grad()
             def smooth(module):
+                offloaded = is_module_offloaded(module)
+                if offloaded:
+                    module._hf_hook.pre_forward(module)
+
                 if module in balance_layers:
                     module.weight.mul_(scales.view(1, -1))
                 elif module == smooth_layer:
@@ -292,6 +297,9 @@ def smooth(module):
                 if hasattr(module, "bias") and module.bias is not None:
                     module.bias.div_(scales)
 
+                if offloaded:
+                    module._hf_hook.post_forward(module, None)
+
             parent = get_fsdp_parent(mapping.smooth_name, model)
             if parent is not None:
                 parent.apply(smooth)
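
The pattern added in the hunks above is worth calling out: when a module's weights are offloaded via an accelerate-style `_hf_hook`, they must be brought onto the execution device before any in-place edit and offloaded again afterwards. A minimal sketch of that guard, using only names that appear in the diff (`apply_update` is a hypothetical stand-in for the in-place weight edit):

import torch
from compressed_tensors.utils.offload import is_module_offloaded

@torch.no_grad()
def update_offload_aware(module: torch.nn.Module, apply_update) -> None:
    offloaded = is_module_offloaded(module)
    if offloaded:
        module._hf_hook.pre_forward(module)  # onload weights to the execution device

    apply_update(module)  # e.g. module.weight.mul_(scales) or module.weight.div_(scales)

    if offloaded:
        module._hf_hook.post_forward(module, None)  # offload weights again
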
@@ -318,8 +326,16 @@ def _calculate_smoothing_scales(
         # get the channel-wise dynamic range for each layer to be balanced
         weight_scales = []
         for layer in balance_layers:
+            offloaded = is_module_offloaded(layer)
+            if offloaded:
+                layer._hf_hook.pre_forward(layer)
+
             scale = layer.weight.abs().max(dim=0, keepdim=True)[0]
             weight_scales.append(scale)
+
+            if offloaded:
+                layer._hf_hook.post_forward(layer, None)
+
         weight_scales = 2.0 * torch.cat(weight_scales, dim=0).max(dim=0)[0]
 
         # calculate the amount of smoothing to apply
@@ -329,4 +345,5 @@
             1 - self.smoothing_strength
         )
         scales = torch.where(weight_scales > 0.0, scales, activation_scales)
+
         return scales
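
For context, the tail of `_calculate_smoothing_scales` shown above follows the SmoothQuant rule s_j = max|X_j|^alpha / max|W_j|^(1-alpha), where alpha is `smoothing_strength` and j indexes input channels; the `torch.where` guard falls back to the raw activation scale wherever a weight channel's range is zero. A self-contained sketch under those assumptions:

import torch

def smoothing_scales(
    activation_scales: torch.Tensor,  # per-channel dynamic range of activations, max|X_j|
    weight_scales: torch.Tensor,      # per-channel weight range across balance layers, max|W_j|
    smoothing_strength: float = 0.8,  # alpha; 0.8 matches the recipes in this commit
) -> torch.Tensor:
    scales = activation_scales**smoothing_strength / weight_scales ** (
        1 - smoothing_strength
    )
    # avoid dividing by zero for channels whose weights are all zero
    return torch.where(weight_scales > 0.0, scales, activation_scales)
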
4 changes: 2 additions & 2 deletions tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml
@@ -4,5 +4,5 @@ scheme: FP8_DYNAMIC
 num_fewshot: 5
 limit: 1000
 task: "gsm8k"
-exact_match,flexible-extract: 0.753
-exact_match,strict-match: 0.753
+exact_match,flexible-extract: 0.75
+exact_match,strict-match: 0.75
10 changes: 10 additions & 0 deletions tests/e2e/vLLM/lm_eval_configs/fp8_static_per_tensor.yaml
@@ -0,0 +1,10 @@
+cadence: "weekly"
+model: meta-llama/Meta-Llama-3-8B-Instruct
+scheme: FP8
+num_fewshot: 5
+limit: 1000
+task: "gsm8k"
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+exact_match,flexible-extract: 0.75
+exact_match,strict-match: 0.75
11 changes: 7 additions & 4 deletions tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml
@@ -1,8 +1,11 @@
 cadence: "weekly"
 model: meta-llama/Meta-Llama-3-8B-Instruct
-scheme: INT8
+scheme: INT8_dyn_per_token
+recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
 num_fewshot: 5
-limit: 250
+limit: 1000
 task: "gsm8k"
-exact_match,flexible-extract: 0.728
-exact_match,strict-match: 0.728
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+exact_match,flexible-extract: 0.77
+exact_match,strict-match: 0.76
11 changes: 11 additions & 0 deletions tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_weight.yaml
@@ -0,0 +1,11 @@
+cadence: "weekly"
+model: meta-llama/Meta-Llama-3-8B-Instruct
+recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
+num_fewshot: 5
+limit: 1000
+task: "gsm8k"
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+exact_match,flexible-extract: 0.72
+exact_match,strict-match: 0.72
+scheme: W4A16_actorder_group
11 changes: 11 additions & 0 deletions tests/e2e/vLLM/lm_eval_configs/w4a16_grouped_quant.yaml
@@ -0,0 +1,11 @@
+cadence: "weekly"
+model: meta-llama/Meta-Llama-3-8B-Instruct
+num_fewshot: 5
+limit: 1000
+task: "gsm8k"
+exact_match,flexible-extract: 0.72
+exact_match,strict-match: 0.72
+scheme: W4A16
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+quant_type: "GPTQ"
11 changes: 11 additions & 0 deletions tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
@@ -0,0 +1,11 @@
+quant_stage:
+  quant_modifiers:
+    SmoothQuantModifier:
+      smoothing_strength: 0.8
+    GPTQModifier:
+      ignore: [lm_head]
+      config_groups:
+        group_0:
+          weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
+          input_activations: {num_bits: 8, type: int, symmetric: true, strategy: token, dynamic: true}
+          targets: [Linear]
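
As a usage note, a recipe like this is normally fed to llmcompressor's `oneshot` entrypoint. A hedged sketch, with the model and dataset taken from the configs above and calibration parameters matching `test_lmeval.py` below (the output directory is hypothetical):

from llmcompressor.transformers import oneshot

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    dataset="ultrachat_200k",  # assumed to resolve to HuggingFaceH4/ultrachat_200k
    splits={"calibration": "train_sft[:512]"},
    recipe="tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml",
    max_seq_length=2048,
    num_calibration_samples=512,
    output_dir="Meta-Llama-3-8B-Instruct-INT8-dyn-per-token",  # hypothetical
)
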
2 changes: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ quant_stage:
   quant_modifiers:
     SmoothQuantModifier:
       smoothing_strength: 0.8
-    QuantizationModifier:
+    GPTQModifier:
       ignore: [lm_head]
       config_groups:
         group_0:
4 changes: 2 additions & 2 deletions tests/e2e/vLLM/test_lmeval.py
@@ -68,7 +68,7 @@ def set_up(self):
         logger.info(self.scheme)
 
         self.device = "cuda:0"
-        self.num_calibration_samples = 256
+        self.num_calibration_samples = 512
         self.max_seq_length = 2048
 
     def test_lm_eval(self):
@@ -104,7 +104,7 @@ def test_lm_eval(self):
 
         logger.info("================= Running LM Eval ======================")
 
-        model_args = f"pretrained={self.save_dir}"
+        model_args = f"pretrained={self.save_dir},add_bos_token=True"
         results = lm_eval.simple_evaluate(
             model="hf",
             model_args=model_args,
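
The `add_bos_token=True` addition passes lm-eval's tokenizer flag so prompts get a BOS token, which Llama-style models expect. A sketch of the full call the test builds up to, with the task parameters from the yaml configs above (`save_dir` is a hypothetical stand-in for `self.save_dir`):

import lm_eval

save_dir = "Meta-Llama-3-8B-Instruct-INT8-dyn-per-token"  # hypothetical path
results = lm_eval.simple_evaluate(
    model="hf",
    model_args=f"pretrained={save_dir},add_bos_token=True",
    tasks=["gsm8k"],  # task from the configs above
    num_fewshot=5,
    limit=1000,
    batch_size="auto",
)
print(results["results"]["gsm8k"]["exact_match,strict-match"])
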
