|  | 
|  | 1 | +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | 
|  | 2 | +# SPDX-License-Identifier: Apache-2.0 | 
|  | 3 | +# | 
|  | 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 5 | +# you may not use this file except in compliance with the License. | 
|  | 6 | +# You may obtain a copy of the License at | 
|  | 7 | +# | 
|  | 8 | +# http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 9 | +# | 
|  | 10 | +# Unless required by applicable law or agreed to in writing, software | 
|  | 11 | +# distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 13 | +# See the License for the specific language governing permissions and | 
|  | 14 | +# limitations under the License. | 
|  | 15 | +"""Implements INT8 quantization for efficient tensor storage and computation.""" | 
|  | 16 | + | 
|  | 17 | +from typing import Union | 
|  | 18 | + | 
|  | 19 | +import torch | 
|  | 20 | + | 
|  | 21 | +from ..qtensor.base_qtensor import BaseQuantizedTensor | 
|  | 22 | +from ..utils import ( | 
|  | 23 | +    convert_quantization_axis_to_reduce_axis, | 
|  | 24 | +    reduce_amax, | 
|  | 25 | +    reduce_block_amax, | 
|  | 26 | +    reduce_block_padding, | 
|  | 27 | +) | 
|  | 28 | + | 
|  | 29 | + | 
class INT8QTensor(BaseQuantizedTensor):
    """Implements the INT8 quantization on tensors for more efficient storage or computation.

    Values are stored as ``torch.int8`` and recovered by multiplying with a scale
    that is either shared along the reduced axes or shared per block.

    Attributes:
        quantized_data (torch.Tensor): The quantized data stored as an INT8 tensor.
    """

    @classmethod
    def quantize(
        cls,
        input: torch.Tensor,
        scales: Union[torch.Tensor, None] = None,
        axis: Union[tuple, int, None] = None,
        block_sizes: Union[dict, None] = None,
    ) -> tuple:
        """Convert a tensor to a quantized format based on INT8 quantization.

        When ``scales`` is omitted it is derived from the input as ``amax / 127``,
        where ``amax`` is computed per block (``block_sizes`` given) or over the
        axes derived from ``axis``.

        Args:
            input (torch.Tensor): The input tensor to be quantized.
            scales (torch.Tensor): The scales for quantization. Computed from
                ``input`` when ``None``.
            axis: The dimensions to reduce for quantization. None or int or tuple of ints.
                Only consulted when ``scales`` is ``None`` and ``block_sizes`` is falsy.
            block_sizes (dict): A dictionary specifying the block size for each dimension
                (``{dim: block_size}``; negative ``dim`` is accepted).
        Note: One can only provide axis or block_sizes for INT8 quantization.

        Returns:
            tuple: INT8QTensor, scales

        Raises:
            AssertionError: If a blocked dimension is not divisible by its block size,
                or if ``scales`` does not have the expected per-block shape.
        """
        original_input = input
        if scales is None:
            if block_sizes:
                # presumably pads each blocked dim up to a multiple of its block size
                # (helper not shown here) — the divisibility assert below relies on it.
                input = reduce_block_padding(input, block_sizes)
                amax = reduce_block_amax(input, block_sizes)
            else:
                reduce_axis = convert_quantization_axis_to_reduce_axis(input, axis)
                amax = reduce_amax(input, axis=reduce_axis)
            # Map the largest magnitude onto the largest positive INT8 value.
            scales = amax / 127.0
        # NOTE(review): padding is only applied when scales are computed here, so
        # caller-supplied scales combined with block_sizes require an input whose
        # blocked dimensions are already divisible by the block size.

        # No defensive copy is needed: `scales` is never modified in place below.
        expanded_scales = scales
        if block_sizes:
            # Calculate the scale shape and make sure it aligns with input and block_sizes
            expected_shape = list(input.shape)
            for dim, block_size in block_sizes.items():
                dim = dim if dim >= 0 else len(input.shape) + dim  # Convert negative index
                assert input.shape[dim] % block_size == 0, (
                    f"Tensor dimension {dim}, {input.shape[dim]} is not divisible by {block_size}."
                )
                # One scale entry per block along this dimension.
                expected_shape[dim] = input.shape[dim] // block_size

            # Assert the shape of `scales` matches expected reduced dimensions
            assert scales.shape == tuple(expected_shape), (
                f"Mismatch in expected scale shape: {scales.shape} vs {tuple(expected_shape)}"
            )

            # Repeat every per-block scale across its block so it lines up
            # element-wise with `input`.
            for dim, block_size in block_sizes.items():
                expanded_scales = expanded_scales.repeat_interleave(block_size, dim=dim)

        # Quantization.
        # A zero scale (e.g. an all-zero tensor or block) turns 0 / 0 into NaN, and
        # casting NaN to int8 has no defined result. Map NaN to 0 explicitly; +/-inf
        # quotients still saturate to 127 / -128 through the clamp.
        scaled = torch.nan_to_num(input / expanded_scales, nan=0.0)
        quantized_data = scaled.round().clamp(-128, 127).to(torch.int8)

        # The original (unpadded) shape and dtype are recorded so that dequantize
        # can crop any padding and restore the dtype.
        return cls(original_input.shape, original_input.dtype, quantized_data), scales

    def dequantize(self, dtype: Union[torch.dtype, None] = None, **kwarg):
        """Dequantize INT8 packed tensor to a target dtype.

        Args:
            dtype (torch.dtype): Target dtype. Defaults to ``self.metadata["dtype"]``
                when ``None``.
            **kwarg: Must contain ``scale`` (torch.Tensor). May contain ``block_sizes``
                (dict of ``{dim: block_size}``); when present, ``scale`` must hold one
                entry per block.

        Returns:
            torch.Tensor: ``quantized_data * scale`` in ``dtype``, sliced to
            ``self.metadata["shape"]``.

        Raises:
            AssertionError: If ``scale`` is missing, a blocked dimension is not
                divisible by its block size, or the scale shape does not match.
        """
        if dtype is None:
            dtype = self.metadata["dtype"]
        assert "scale" in kwarg, "Require scale for INT8 dequantization."

        # Get args
        scales = kwarg["scale"]
        block_sizes = kwarg.get("block_sizes", None)

        shape = self._quantized_data.shape
        if block_sizes:
            # Compute the per-block shape the scales are expected to have
            expanded_shape = list(shape)
            for dim, block_size in block_sizes.items():
                assert shape[dim] % block_size == 0, (
                    f"Dimension {shape[dim]} is not divisible by {block_size}."
                )
                expanded_shape[dim] //= block_size  # Reduce the dimension size for blocks

            assert tuple(expanded_shape) == scales.shape, (
                f"Scales shape {scales.shape} must match expected {tuple(expanded_shape)}."
            )

            # Expand scales so every element is paired with its block's scale
            for dim, block_size in block_sizes.items():
                scales = scales.repeat_interleave(block_size, dim=dim)

        # Handle padded tensors: crop back to the shape stored in the metadata
        slices = tuple(slice(0, dim) for dim in self.metadata["shape"])

        return (self._quantized_data.view(torch.int8).to(dtype) * scales.to(dtype))[slices]
0 commit comments