[Quantization] Add TRT-ModelOpt as a Backend #11173
First new file (1 line), re-exporting the quantizer:

```python
from .modelopt_quantizer import NVIDIAModelOptQuantizer
```
Second new file (157 lines), `modelopt_quantizer.py`, implementing the quantizer backend:

```python
from typing import TYPE_CHECKING, Any, Dict, List, Union

from ...utils import (
    get_module_from_name,
    is_accelerate_available,
    is_nvidia_modelopt_available,
    is_nvidia_modelopt_version,
    is_torch_available,
    logging,
)
from ..base import DiffusersQuantizer


if TYPE_CHECKING:
    from ...models.modeling_utils import ModelMixin


if is_torch_available():
    import torch

if is_accelerate_available():
    from accelerate.utils import set_module_tensor_to_device


logger = logging.get_logger(__name__)


class NVIDIAModelOptQuantizer(DiffusersQuantizer):
    r"""
    Diffusers Quantizer for TensorRT Model Optimizer
    """

    use_keep_in_fp32_modules = True
    requires_calibration = False
    required_packages = ["modelopt"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

    def validate_environment(self, *args, **kwargs):
        if not is_nvidia_modelopt_available():
            raise ImportError(
                "Loading an nvidia-modelopt quantized model requires the nvidia-modelopt library (`pip install nvidia-modelopt`)."
            )
        if not is_nvidia_modelopt_version(">=", "0.25.0"):
            raise ImportError(
                "Loading an nvidia-modelopt quantized model requires `nvidia-modelopt>=0.25.0`. "
                "Please upgrade your installation with `pip install --upgrade nvidia-modelopt`."
            )

        self.offload = False

        device_map = kwargs.get("device_map", None)
        if isinstance(device_map, dict):
            if "cpu" in device_map.values() or "disk" in device_map.values():
                if self.pre_quantized:
                    raise ValueError(
                        "You are attempting to perform cpu/disk offload with a pre-quantized modelopt model. "
                        "This is not supported yet. Please remove the CPU or disk device from the `device_map` argument."
                    )
                else:
                    self.offload = True

    def check_if_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        # ModelOpt imports diffusers internally. This is here to prevent circular imports.
        from modelopt.torch.quantization.qtensor import BaseQuantizedTensor
        from modelopt.torch.quantization.utils import is_quantized

        module, tensor_name = get_module_from_name(model, param_name)
        if self.pre_quantized and any(isinstance(module, t) for t in [BaseQuantizedTensor]):
            return True
        elif is_quantized(module) and "weight" in tensor_name:
            return True
        return False

    def create_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        *args,
        **kwargs,
    ):
        """
        Create the quantized parameter by calling .calibrate() after setting it on the module.
        """
        # ModelOpt imports diffusers internally. This is here to prevent circular imports.
        import modelopt.torch.quantization as mtq

        dtype = kwargs.get("dtype", torch.float32)
        module, tensor_name = get_module_from_name(model, param_name)
        if self.pre_quantized:
            setattr(module, tensor_name, param_value)
        else:
            set_module_tensor_to_device(model, param_name, target_device, param_value, dtype)
            mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.forward_loop)
            mtq.compress(module)
```
Inline review comment on `mtq.compress(module)`: "`mtq.compress` compresses the model weights into lower-bit representations, allowing users to leverage it directly at the Torch level. However, as previously mentioned, to achieve actual speed improvements we need to utilize the TensorRT runtime rather than the Torch runtime."
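For context, here is a minimal sketch of that Torch-level flow as it looks outside of diffusers. The toy model, the `FP8_DEFAULT_CFG` choice, and the calibration loop are illustrative assumptions, not part of this PR:

```python
import torch
import modelopt.torch.quantization as mtq

# Toy stand-in for a diffusers transformer block (illustrative only).
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU(), torch.nn.Linear(64, 64))


def forward_loop(m):
    # Calibration: run a few representative batches through the model so the
    # inserted quantizers can collect statistics.
    for _ in range(8):
        m(torch.randn(4, 64))


# Insert quantizers and calibrate them with one of the stock ModelOpt configs.
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)

# Compress weights into their lower-bit representation; the model is now usable
# at the Torch level, but the real latency gains come from deploying through
# the TensorRT runtime rather than eager PyTorch.
mtq.compress(model)
```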
```python
            module.weight.requires_grad = False
```
Review thread on `module.weight.requires_grad = False`:

- "thanks, this should actually be part of ModelOpt."
- "I think you meant L113-116 should be a part of ModelOpt?"
- "yes."
- "Thanks @realAsma, any specific reason why you think it should be a part of ModelOpt? The reason I kept it separate is to follow the convention used in diffusers: we first want to process the model and add all the quant layers, and only then calibrate/compress the modules. There are two methods inside the …; let me know if I am missing something. Do you mean using …?"
- "Please ignore my comments. That was an internal note for ModelOpt."
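To make the convention being referenced concrete, this is roughly the order in which a `DiffusersQuantizer` backend is driven during model loading. It is a simplified paraphrase based on the methods defined in this file, not the actual diffusers loader code:

```python
def load_quantized_model(model, state_dict, quantizer, device):
    # 1. Sanity-check the environment and the requested device_map.
    quantizer.validate_environment(device_map=None)

    # 2. Before any weights are loaded, insert the quantized layers.
    #    For this backend that is mto.apply_mode(model, mode=[("quantize", ...)]).
    quantizer._process_model_before_weight_loading(model, device_map=None)

    # 3. While loading the checkpoint, each parameter that belongs to a quantized
    #    module is routed through the quantizer, which calibrates/compresses it
    #    (or simply re-attaches it when the checkpoint is already pre-quantized).
    for name, value in state_dict.items():
        if quantizer.check_if_quantized_param(model, value, name, state_dict):
            quantizer.create_quantized_param(model, value, name, device)

    # 4. Optional post-processing once every weight is in place.
    return quantizer._process_model_after_weight_loading(model)
```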
The remainder of `modelopt_quantizer.py`:

```python
    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
        max_memory = {key: val * 0.90 for key, val in max_memory.items()}
        return max_memory

    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
        if self.quantization_config.quant_type == "FP8":
            target_dtype = torch.float8_e4m3fn
        return target_dtype

    def update_torch_dtype(self, torch_dtype: "torch.dtype" = None) -> "torch.dtype":
        if torch_dtype is None:
            logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.")
            torch_dtype = torch.float32
        return torch_dtype

    def _process_model_before_weight_loading(
        self,
        model: "ModelMixin",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        # ModelOpt imports diffusers internally. This is here to prevent circular imports.
        import modelopt.torch.opt as mto

        modules_to_not_convert = self.quantization_config.modules_to_not_convert

        if modules_to_not_convert is None:
            modules_to_not_convert = []
        if isinstance(modules_to_not_convert, str):
            modules_to_not_convert = [modules_to_not_convert]
        modules_to_not_convert.extend(keep_in_fp32_modules)

        for module in modules_to_not_convert:
            self.quantization_config.modelopt_config["quant_cfg"]["*" + module + "*"] = {"enable": False}
        self.quantization_config.modules_to_not_convert = modules_to_not_convert

        mto.apply_mode(model, mode=[("quantize", self.quantization_config.modelopt_config)])
        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    @property
    def is_trainable(self):
        return True

    @property
    def is_serializable(self):
        return True
```
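As a small illustration of what `_process_model_before_weight_loading` ends up passing to `mto.apply_mode`: with the example values `modules_to_not_convert=["proj_out"]` and `keep_in_fp32_modules=["norm_out"]` (both made up here), the ModelOpt config would look roughly like this, with wildcard patterns disabling quantization for the excluded modules:

```python
modelopt_config = {
    "quant_cfg": {
        "*weight_quantizer*": {"num_bits": (4, 3), "axis": None},  # illustrative FP8 entry
        "*input_quantizer*": {"num_bits": (4, 3), "axis": None},
        "*proj_out*": {"enable": False},   # from modules_to_not_convert
        "*norm_out*": {"enable": False},   # from keep_in_fp32_modules
    },
    "algorithm": "max",
}
```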
Review thread on the dependency specification:

- "I think only the torch optional deps are needed here. Diffusers is installed here anyway, and having diffusers depend indirectly on transformers, accelerate, datasets, and other HF dependencies doesn't make sense. `nvidia-modelopt[hf]` might pin the dependencies differently than here, which may potentially cause conflicts."
- "`transformers` and `datasets` are not required packages for `diffusers`, just as an FYI. We prefer keeping our dependencies lean."
- "Keeping this as it is for now, let me know what you think?"
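For reference, a hedged sketch of how the backend might be exercised end to end with only the base `nvidia-modelopt` package installed. The `NVIDIAModelOptConfig` name and its arguments are assumptions inferred from the `quantization_config` attributes the quantizer reads (`quant_type`, `modules_to_not_convert`, `modelopt_config`, `forward_loop`), not a confirmed public API from this PR:

```python
# pip install diffusers nvidia-modelopt   # base package only; the [hf] extra is not assumed here
import torch

from diffusers import FluxTransformer2DModel

# Hypothetical config class for this backend; the final exported name/signature may differ.
from diffusers import NVIDIAModelOptConfig

quant_config = NVIDIAModelOptConfig(quant_type="FP8", modules_to_not_convert=["proj_out"])

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```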