diff --git a/llama/quant_util.py b/llama/quant_util.py
new file mode 100644
index 000000000..66c2e2b1a
--- /dev/null
+++ b/llama/quant_util.py
@@ -0,0 +1,114 @@
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+# Imported for its side effect: registering the
+# torch.ops.quantized_decomposed.* operators used below.
+import torch.ao.quantization.fx._decomposed
+
+# Lower bound for scales so an all-zero channel never gets scale == 0.
+EPS = torch.finfo(torch.float32).eps
+
+
+@dataclass
+class TensorQConfig:
+    """Settings describing how a single tensor is quantized.
+
+    Only symmetric, per-channel int8 quantization is implemented so far
+    (see ``_find_qparams``).
+    """
+
+    # Integer dtype of the quantized tensor.
+    dtype: torch.dtype = torch.int8
+    # Channel axis. ``_find_qparams`` only accepts a non-negative index, so
+    # the default of -1 must be overridden by the caller.
+    axis: int = -1
+    # Inclusive range the quantized values are clamped to.
+    quant_min: int = -128
+    quant_max: int = 127
+    # Symmetric quantization uses no zero-point.
+    symmetric_quant: bool = True
+
+
+def _get_dtype_min_max(dtype: torch.dtype):
+    """Return the inclusive ``(min, max)`` integer range of *dtype*.
+
+    Raises:
+        ValueError: If *dtype* is neither ``torch.int8`` nor ``torch.uint8``.
+    """
+    if dtype == torch.int8:
+        return -128, 127
+    if dtype == torch.uint8:
+        return 0, 255
+    raise ValueError(f"Unsupported quantization dtype: {dtype}")
+
+
+def _find_per_channel_min_max(x: torch.Tensor, axis: int):
+    """Return the per-channel ``(min, max)`` of *x* along *axis*.
+
+    Both results are 1-D tensors with one entry per index of *axis*.
+    """
+    # Bring the channel axis to the front and collapse every other dimension,
+    # so one reduction over dim 1 covers a whole channel.
+    per_channel = torch.flatten(x.transpose(0, axis), start_dim=1)
+    return torch.aminmax(per_channel, dim=1)
+
+
+def _find_qparams(x: torch.Tensor, qconfig: TensorQConfig):
+    """Compute per-channel quantization parameters for *x*.
+
+    Only symmetric per-channel quantization to int8 is supported for now.
+
+    Returns:
+        ``(scale, zero_point)``: ``scale`` holds one entry per channel;
+        ``zero_point`` is always ``None`` because the scheme is symmetric.
+
+    Raises:
+        ValueError: If ``qconfig.axis`` is not a valid non-negative axis of *x*.
+        NotImplementedError: If *qconfig* asks for anything other than
+            symmetric int8 quantization.
+    """
+    axis = qconfig.axis
+    if not 0 <= axis < x.dim():
+        raise ValueError(
+            f"axis must be in [0, {x.dim()}), got {axis}")
+    if qconfig.dtype != torch.int8 or not qconfig.symmetric_quant:
+        raise NotImplementedError(
+            "Only symmetric int8 quantization is supported")
+
+    min_val, max_val = _find_per_channel_min_max(x, axis)
+    # Largest magnitude on either side of zero; never negative because
+    # max_val >= min_val.
+    max_abs = torch.max(-min_val, max_val)
+    scale = max_abs / (float(qconfig.quant_max - qconfig.quant_min) / 2)
+    # Keep the scale strictly positive so it is always safe to divide by.
+    scale = torch.clamp(scale, min=EPS)
+    return scale, None
+
+
+def _quantize_to_dtype(x: torch.Tensor, qconfig: TensorQConfig,
+                       scale: torch.Tensor,
+                       zero_point: Optional[torch.Tensor] = None):
+    """Quantize *x* per channel using *scale* and *zero_point*.
+
+    A missing *zero_point* is treated as all zeros (symmetric quantization).
+    """
+    if zero_point is None:
+        zero_point = torch.zeros_like(scale)
+    return torch.ops.quantized_decomposed.quantize_per_channel(
+        x, scale, zero_point, qconfig.axis, qconfig.quant_min,
+        qconfig.quant_max, qconfig.dtype
+    )
+
+
+def quantize_tensor(x: torch.Tensor, qconfig: TensorQConfig):
+    """Quantize *x* as described by *qconfig*.
+
+    Returns:
+        ``(x_int, scale, zero_point)``: the quantized tensor, the per-channel
+        scales, and the zero-points (``None`` for symmetric quantization).
+    """
+    scale, zp = _find_qparams(x, qconfig)
+    x_int = _quantize_to_dtype(x, qconfig, scale, zp)
+    return x_int, scale, zp
diff --git a/llama/xla_model_parallel.py b/llama/xla_model_parallel.py
index f0b323976..11d6a8094 100644
--- a/llama/xla_model_parallel.py
+++ b/llama/xla_model_parallel.py
@@ -10,6 +10,8 @@
 
 from fairscale.nn.model_parallel.utils import divide_and_check_no_remainder, split_tensor_along_last_dim
 
+from .quant_util import TensorQConfig, quantize_tensor
+
 import os
 
 USE_CUDA = os.environ.get('USE_CUDA', False)
@@ -394,9 +396,8 @@ def __init__(
         if quant:
             self.weight = Parameter(torch.empty(
                 (self.output_size_per_partition, self.in_features),
-                dtype=torch.int8),
-                                    requires_grad=False)
-            self.weight_scaler = Parameter(torch.zeros(1), requires_grad=False)
+                dtype=torch.int8), requires_grad=False)
+            self.weight_scaler = Parameter(torch.Tensor(self.output_size_per_partition), requires_grad=False)
         else:
             self.weight = Parameter(
                 torch.Tensor(self.output_size_per_partition, self.in_features))
@@ -427,6 +428,26 @@ def get_master_weight(self) -> torch.Tensor:
             self.weight.data.transpose(0, 1), self.groups, self.world_size,
             self.rank).transpose_(0, 1)
 
+    def quantize(self):
+        """Replace the float weight with a per-output-channel int8 weight.
+
+        ``weight`` becomes an int8 tensor of the same shape and
+        ``weight_scaler`` holds one scale per output row, stored in the
+        weight's original dtype.  Must only be called once per layer.
+        """
+        assert not self.quant, "layer is already quantized"
+        orig_dtype = self.weight.dtype
+        # Scales are computed in float32.  Nothing below writes to the old
+        # weight, so no defensive copy of it is needed.
+        fp_w = self.weight.data.to(torch.float32)
+        # axis=0: one scale per row of the (output, input) weight matrix.
+        w_int, scale, _ = quantize_tensor(fp_w, TensorQConfig(axis=0))
+        self.weight = Parameter(w_int, requires_grad=False)
+        # The scales are constants of the quantized layer, not trainable.
+        self.weight_scaler = Parameter(scale.to(orig_dtype),
+                                       requires_grad=False)
+        self.quant = True
+
     def forward(self, input_: torch.Tensor) -> torch.Tensor:  # type: ignore
         # Set up backprop all-reduce.
         input_parallel = copy_to_model_parallel_region(input_, self.groups,
@@ -523,9 +544,8 @@ def __init__(
         if quant:
             self.weight = Parameter(torch.empty(
                 (self.out_features, self.input_size_per_partition),
-                dtype=torch.int8),
-                                    requires_grad=False)
-            self.weight_scaler = Parameter(torch.zeros(1), requires_grad=False)
+                dtype=torch.int8), requires_grad=False)
+            self.weight_scaler = Parameter(torch.Tensor(self.out_features), requires_grad=False)
         else:
             self.weight = Parameter(
                 torch.Tensor(self.out_features, self.input_size_per_partition))
@@ -555,6 +575,26 @@ def get_master_weight(self) -> torch.Tensor:
         return gather_from_model_parallel_region(self.weight.data, self.groups,
                                                  self.world_size, self.rank)
 
+    def quantize(self):
+        """Replace the float weight with a per-output-channel int8 weight.
+
+        ``weight`` becomes an int8 tensor of the same shape and
+        ``weight_scaler`` holds one scale per output row, stored in the
+        weight's original dtype.  Must only be called once per layer.
+        """
+        assert not self.quant, "layer is already quantized"
+        orig_dtype = self.weight.dtype
+        # Scales are computed in float32.  Nothing below writes to the old
+        # weight, so no defensive copy of it is needed.
+        fp_w = self.weight.data.to(torch.float32)
+        # axis=0: one scale per row of the (output, input) weight matrix.
+        w_int, scale, _ = quantize_tensor(fp_w, TensorQConfig(axis=0))
+        self.weight = Parameter(w_int, requires_grad=False)
+        # The scales are constants of the quantized layer, not trainable.
+        self.weight_scaler = Parameter(scale.to(orig_dtype),
+                                       requires_grad=False)
+        self.quant = True
+
     def forward(self, input_: torch.Tensor) -> torch.Tensor:  # type:ignore
         # Set up backprop all-reduce.
if self.input_is_parallel: diff --git a/reshard_checkpoints.py b/reshard_checkpoints.py index 86e147586..f488d79f3 100644 --- a/reshard_checkpoints.py +++ b/reshard_checkpoints.py @@ -17,7 +17,7 @@ @torch.no_grad() -def reshard(original_mp, target_mp, ckpt_dir, output_dir, tokenizer_path): +def reshard(original_mp, target_mp, ckpt_dir, output_dir, tokenizer_path, quantize=False): assert target_mp > original_mp > 0 factor = divide_and_check_no_remainder(target_mp, original_mp) @@ -109,6 +109,8 @@ def reshard(original_mp, target_mp, ckpt_dir, output_dir, tokenizer_path): factor)[shard_rank].contiguous() assert weight_shard.size() == module.weight.size() module.weight.copy_(weight_shard) + if quantize: + module.quantize() elif isinstance(module, ColumnParallelLinear): source_module = original_model.get_submodule(name) assert module.bias is None and source_module.bias is None @@ -122,14 +124,18 @@ def reshard(original_mp, target_mp, ckpt_dir, output_dir, tokenizer_path): factor // kv_head_duplicate)[shard_rank // kv_head_duplicate].transpose(0, 1).contiguous() assert weight_shard.size() == module.weight.size() module.weight.copy_(weight_shard) + if quantize: + module.quantize() state_dict = { k: v for k, v in target_model.state_dict().items() - if k in checkpoint.keys() + if k in checkpoint.keys() or "weight_scaler" in k # TODO: "weight_scaler" are new parameters after quant, add to state_dict in a more elegant way. } torch.save(state_dict, Path(output_dir) / f"{target_rank:03}.pth") + if quantize: + new_params['quant'] = True with open(Path(output_dir) / "params.json", "w") as f: json.dump(new_params, f)