diff --git a/examples/nlp/language_modeling/conf/megatron_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_quantization.yaml
index 88d10ae0a66c..52454f5c8906 100644
--- a/examples/nlp/language_modeling/conf/megatron_quantization.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_quantization.yaml
@@ -26,6 +26,7 @@ quantization:
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
   awq_block_size: 128 # block size for scaling factors in AWQ algorithm
+  alpha: 1.0 # alpha parameter in SmoothQuant algorithm

 export:
   decoder_type: llama # gptnext, gpt2, llama
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index 4748f4957a52..e25d529ec62c 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -116,6 +116,9 @@ def __init__(
                 "axis": None,
                 "enable": enable_quant_kv_cache,
             }
+            if quantization_config.algorithm == "int8_sq":
+                logging.info(f"Using int8_sq alpha = {quantization_config.alpha}")
+                quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": quantization_config.alpha}

             self.quant_cfg = quant_cfg
         else:
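
For reference, a minimal sketch of what the new branch produces when `algorithm: int8_sq` is selected. The base contents of `quant_cfg` are abbreviated, and every field other than the added `"algorithm"` entry is an assumption based on the surrounding quantizer code, not part of this change.

```python
# Sketch only: illustrates the effect of the new int8_sq branch above.
# The base quant_cfg is abbreviated; its fields are assumptions taken
# from the surrounding quantizer code, not introduced by this change.
alpha = 1.0  # quantization.alpha from megatron_quantization.yaml

quant_cfg = {
    "quant_cfg": {
        "*output_quantizer": {"num_bits": 8, "axis": None, "enable": False},
    },
}

# New behavior: when int8_sq is selected, the SmoothQuant alpha is passed
# through to the algorithm spec consumed by the quantization backend.
quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": alpha}

print(quant_cfg["algorithm"])  # {'method': 'smoothquant', 'alpha': 1.0}
```

With the default `alpha: 1.0` the behavior matches plain SmoothQuant scaling; lowering the value (e.g. via a Hydra override such as `quantization.alpha=0.5` when running the quantization example) shifts more of the quantization difficulty from activations to weights.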