From 880cbe7e9eb2a858a487353e24f9d1ebe77f7757 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 10 Nov 2025 03:26:25 +0000 Subject: [PATCH 1/3] implement propagate_error Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 17 +++++++++++++---- .../pipelines/sequential/pipeline.py | 12 ++++++++---- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 6f3c16fcf..03c854e98 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -241,10 +241,19 @@ class DatasetArguments(CustomDatasetArguments): quantization_aware_calibration: bool = field( default=True, metadata={ - "help": "Whether to enable quantization-aware calibration in the pipeline. " - "When True, quantization is applied during forward pass in calibration. " - "When False, quantization is disabled during forward pass in calibration. " - "Default is set to True." + "help": "Only relevant for the sequential pipeline. " + "If True, quantization is applied during forward pass in calibration. " + "If False, quantization is disabled during forward pass in calibration. " + "Default is True." + }, + ) + propagate_error: bool = field( + default=True, + metadata={ + "help": "Only relevant for the sequential pipeline. If True, use quantized " + "layer outputs as the inputs to the next sequential layer. If False, use " + "unquantized layer outputs as the inputs to the next sequential layer. " + "Default is True" }, ) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 91516f280..685be36d3 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -104,7 +104,11 @@ def __call__( # do a preliminary pass to trigger modifier hooks for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc): inputs = activations.fetch(batch_idx, subgraph.input_names) - subgraph.forward(model, **inputs) + outputs = subgraph.forward(model, **inputs) + + if not dataset_args.propagate_error: + activations.update(batch_idx, outputs) + activations.delete(batch_idx, subgraph.consumed_names) LifecycleCallbacks.sequential_epoch_end(subgraph) @@ -113,10 +117,10 @@ def __call__( with HooksMixin.disable_hooks(): for batch_idx in tqdm(range(len(dataloader)), desc=prop_desc): inputs = activations.fetch(batch_idx, subgraph.input_names) - output = subgraph.forward(model, **inputs) + outputs = subgraph.forward(model, **inputs) - if subgraph_index < num_subgraphs - 1: - activations.update(batch_idx, output) + if dataset_args.propagate_error: + activations.update(batch_idx, outputs) activations.delete(batch_idx, subgraph.consumed_names) # redundant, finish any remaining compression From 592dea955e000b6a8d2117ddead499117ecbef3c Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 10 Nov 2025 03:39:45 +0000 Subject: [PATCH 2/3] optimization Signed-off-by: Kyle Sayers --- .../pipelines/sequential/pipeline.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 685be36d3..0ece6c396 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -102,26 +102,27 @@ def __call__( # reduce memory movement by keeping modules onloaded with disable_offloading(): # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc): - inputs = activations.fetch(batch_idx, subgraph.input_names) + for b_idx in tqdm(range(len(dataloader)), desc=calib_desc): + inputs = activations.fetch(b_idx, subgraph.input_names) outputs = subgraph.forward(model, **inputs) if not dataset_args.propagate_error: - activations.update(batch_idx, outputs) - activations.delete(batch_idx, subgraph.consumed_names) + activations.update(b_idx, outputs) + activations.delete(b_idx, subgraph.consumed_names) LifecycleCallbacks.sequential_epoch_end(subgraph) - # this pass does not trigger modifier hooks - # and is only used for capturing outputs of newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm(range(len(dataloader)), desc=prop_desc): - inputs = activations.fetch(batch_idx, subgraph.input_names) - outputs = subgraph.forward(model, **inputs) - - if dataset_args.propagate_error: - activations.update(batch_idx, outputs) - activations.delete(batch_idx, subgraph.consumed_names) + if not dataset_args.propagate_error: + # this pass does not trigger modifier hooks + # and is only used for capturing outputs of compressed modules + with HooksMixin.disable_hooks(): + for b_idx in tqdm(range(len(dataloader)), desc=prop_desc): + inputs = activations.fetch(b_idx, subgraph.input_names) + outputs = subgraph.forward(model, **inputs) + + if dataset_args.propagate_error: + activations.update(b_idx, outputs) + activations.delete(b_idx, subgraph.consumed_names) # redundant, finish any remaining compression LifecycleCallbacks.calibration_epoch_end() From 7c24a322e0ca5efba81cccfbd56145f3a81847f7 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 10 Nov 2025 03:40:09 +0000 Subject: [PATCH 3/3] fix typo Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/sequential/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 0ece6c396..f11b6bbe6 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -112,7 +112,7 @@ def __call__( LifecycleCallbacks.sequential_epoch_end(subgraph) - if not dataset_args.propagate_error: + if dataset_args.propagate_error: # this pass does not trigger modifier hooks # and is only used for capturing outputs of compressed modules with HooksMixin.disable_hooks():