 }


-def load(output_dir="./saved_results", model=None):
-    from neural_compressor.common.base_config import ConfigRegistry
-
-    qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "qconfig.json")
-    with open(qconfig_file_path, "r") as f:
-        per_op_qconfig = json.load(f)
-
-    if " " in per_op_qconfig.keys():  # ipex qconfig format: {' ': {'q_op_infos': {'0': {'op_type': ...
-        from neural_compressor.torch.algorithms.static_quant import load
-
-        return load(output_dir)
-    else:
-        config_mapping = load_config_mapping(qconfig_file_path, ConfigRegistry.get_all_configs()["torch"])
-        # select load function
-        config_object = config_mapping[next(iter(config_mapping))]
-        if isinstance(config_object, (RTNConfig, GPTQConfig, AWQConfig, TEQConfig, AutoRoundConfig)):  # WOQ
-            from neural_compressor.torch.algorithms.weight_only.save_load import load
-
-            return load(output_dir)
-
-        model.qconfig = config_mapping
-        if isinstance(config_object, FP8Config):  # FP8
-            from neural_compressor.torch.algorithms.habana_fp8 import load
-
-            return load(model, output_dir)  # pylint: disable=E1121
+def load(model_name_or_path="./saved_results", model=None, format="default", *hf_model_args, **hf_model_kwargs):
+    """Load quantized model.
+
+    Args:
+        model_name_or_path (str, optional): local path where the quantized weights or model are saved,
+            or a Hugging Face model id. Defaults to "./saved_results".
+        model (torch.nn.Module, optional): original model. Required when loading an INC WOQ quantized model
+            or an FP8 model. Defaults to None.
+        format (str, optional): 'default' for loading an INC quantized model;
+            'huggingface' currently only for loading a Hugging Face WOQ causal language model.
+            Defaults to "default".
+
+    Returns:
+        torch.nn.Module: quantized model
+    """
+    if format == "default":
+        from neural_compressor.common.base_config import ConfigRegistry
+        from neural_compressor.torch.algorithms.static_quant import load as static_quant_load
+        from neural_compressor.torch.algorithms.weight_only.save_load import load as woq_load
+        from neural_compressor.torch.algorithms.habana_fp8 import load as habana_fp8_load
+
+        qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(model_name_or_path)), "qconfig.json")
+        with open(qconfig_file_path, "r") as f:
+            per_op_qconfig = json.load(f)
+
+        if " " in per_op_qconfig.keys():  # ipex qconfig format: {' ': {'q_op_infos': {'0': {'op_type': ...
+            return static_quant_load(model_name_or_path)
+        else:
+            config_mapping = load_config_mapping(qconfig_file_path, ConfigRegistry.get_all_configs()["torch"])
+            # select load function
+            config_object = config_mapping[next(iter(config_mapping))]
+
+            if isinstance(config_object, (RTNConfig, GPTQConfig, AWQConfig, TEQConfig, AutoRoundConfig)):  # WOQ
+                return woq_load(model_name_or_path, model=model, format=format)
+
+            model.qconfig = config_mapping
+            if isinstance(config_object, FP8Config):  # FP8
+                return habana_fp8_load(model, model_name_or_path)
+    elif format == "huggingface":
+        # currently only supports loading Hugging Face WOQ causal language models
+        from neural_compressor.torch.algorithms.weight_only.save_load import load as woq_load
+
+        return woq_load(model_name_or_path, format=format, *hf_model_args, **hf_model_kwargs)
+    else:
+        raise ValueError("`format` in load function can only be 'huggingface' or 'default', but got {}".format(format))
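A minimal usage sketch of the reworked entry point, assuming the quantized artifacts were produced by INC's save flow and that this load function is re-exported from neural_compressor.torch.quantization (as in INC 3.x); the directory, model id, and stand-in module below are illustrative placeholders, not part of this change:

    # Sketch only: "./saved_results" must contain a qconfig.json written by INC's save(),
    # and `fp32_model` stands in for the original torch.nn.Module that was quantized
    # (required when restoring WOQ or FP8 checkpoints).
    import torch
    from neural_compressor.torch.quantization import load

    fp32_model = torch.nn.Linear(8, 8)  # placeholder for the original model

    # Default (INC) format: restore the quantized model from a local results directory.
    q_model = load(model_name_or_path="./saved_results", model=fp32_model)

    # Hugging Face format: load a WOQ causal language model from a hub id; extra
    # positional/keyword args are forwarded to the underlying Hugging Face loader.
    hf_q_model = load("org/woq-model-id", format="huggingface")  # placeholder model id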