 import math
 import os
 import re
-from collections import OrderedDict, UserDict, namedtuple
+from collections import OrderedDict, UserDict
 from functools import partial
 
 import yaml
@@ -1800,7 +1800,7 @@ def smooth_quant(
                 assert folding, "IPEX version >= 2.1 is required for SmoothQuant folding=False."
 
         if not hasattr(self, "sq") or force_re_smooth:
-            from .torch_utils.smooth_quant import TorchSmoothQuant
+            from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant
 
             self.sq = TorchSmoothQuant(
                 model._model, dataloader=dataloader, example_inputs=self.example_inputs, q_func=self.q_func
@@ -1813,17 +1813,18 @@ def smooth_quant(
             kwargs["percentile"] = percentile
         if scales_per_op is not None:
             kwargs["scales_per_op"] = scales_per_op
+        auto_alpha_args["init_alpha"] = default_alpha
         model._model = self.sq.transform(
             alpha=alpha,
             folding=folding,
             calib_iter=calib_iter,
             weight_clip=weight_clip,
-            default_alpha=default_alpha,
             auto_alpha_args=auto_alpha_args,
             **kwargs,
         )
         if self.sq.record_max_info:
             model.sq_max_info = self.sq.max_value_info
+            model.sq_scale_info = self.sq.sq_scale_info
         return model
 
     def _apply_pre_optimization(self, model, tune_cfg, recover=False):
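
The hunk above drops `default_alpha` from the `transform()` call: the starting alpha now travels inside `auto_alpha_args` as `init_alpha`, and the per-layer scale info is additionally recorded on the model as `sq_scale_info`. A minimal sketch of the assumed call pattern after this change (the `adaptor` handle and the concrete values are placeholders, not part of the diff):

```python
# Hypothetical usage sketch; `adaptor` stands in for the PyTorch adaptor
# instance whose smooth_quant() method is shown in the hunk above.
auto_alpha_args = {"init_alpha": 0.5}  # previously passed as default_alpha=0.5
model._model = adaptor.sq.transform(
    alpha="auto",
    folding=False,
    calib_iter=100,
    weight_clip=True,
    auto_alpha_args=auto_alpha_args,
)
```
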
@@ -1840,7 +1841,7 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False):
         q_model = model._model
         sq_max_info = model.sq_max_info
         if sq_max_info:
-            from .torch_utils.smooth_quant import TorchSmoothQuant
+            from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant
 
             tsq = TorchSmoothQuant(q_model, None)
             alpha = tune_cfg["recipe_cfgs"]["smooth_quant_args"]["alpha"]
@@ -1876,8 +1877,9 @@ def qdq_quantize(self, model, tune_cfg):
             model: qdq quantized model.
         """
         q_model = model._model
+        from neural_compressor.adaptor.torch_utils.waq import get_module, set_module
+
         from .torch_utils.model_wrapper import QDQLinear, SQLinearWrapper
-        from .torch_utils.smooth_quant import get_module, set_module
 
         smoothquant_scale_info = {}
         fallback_op_name_list = []
@@ -3317,37 +3319,7 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
         inplace = True if self.performance_only else False
 
         # fetch SmoothQuant scale info from pre-optimized model
-        sq_max_info = model.sq_max_info
-        if sq_max_info:
-            smoothquant_scale_info = {}
-            from .torch_utils.model_wrapper import SQLinearWrapper
-            from .torch_utils.smooth_quant import get_module
-
-            for _, info in sq_max_info.items():
-                alpha = info["alpha"]
-                absorbed_layer = info["absorbed_layer"]
-                input_minmax = info["input_minmax"]
-                # for peft model, lora_B weights is 0.
-                weight_max = info["weight_max"]
-                if self.sq.weight_clip:
-                    weight_max = weight_max.clamp(min=1e-5)
-                abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
-                input_power = torch.pow(abs_input_max, alpha)
-                weight_power = torch.pow(weight_max, 1 - alpha)
-                scale = torch.clip(input_power / weight_power, min=1e-5)
-                for op_name in absorbed_layer:
-                    module = copy.deepcopy(get_module(q_model._model, op_name))
-                    new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
-                    weight_scale = new_module._get_weight_scale()
-                    smoothquant_scale_info[op_name] = {
-                        "alpha": new_module.alpha,
-                        "input_scale_for_mul": new_module.input_scale,
-                        "input_scale_after_mul": new_module.scale,
-                        "input_zero_point_after_mul": new_module.zero_point,
-                        "input_dtype": new_module.dtype,
-                        "weight_scale_after_mul": weight_scale,
-                    }
3350-                     logger .debug (f"Current SmoothQuant alpha of { op_name } { alpha }  )
3322+         smoothquant_scale_info  =  model .sq_scale_info 
33513323
33523324        # Check save_qconf_summary part is a workaround for IPEX bug. 
33533325        # Sometimes the prepared model from get_op_capablitiy loss this attribute 
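
For reference, the block removed above derived each layer's SmoothQuant scale from the recorded activation and weight maxima before wrapping the layer in `SQLinearWrapper`; after this change that computation happens inside `TorchSmoothQuant`, which exposes the result as `model.sq_scale_info`. A minimal sketch of the scale math the deleted lines performed (function name and signature are illustrative only):

```python
import torch

def smoothquant_scale(input_minmax, weight_max, alpha, weight_clip=True):
    """Reproduce the per-layer scale formula from the removed block above."""
    if weight_clip:
        # lora_B weights in PEFT models can be all zeros, so clamp before dividing
        weight_max = weight_max.clamp(min=1e-5)
    abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
    input_power = torch.pow(abs_input_max, alpha)    # |X|_max ** alpha
    weight_power = torch.pow(weight_max, 1 - alpha)  # |W|_max ** (1 - alpha)
    return torch.clip(input_power / weight_power, min=1e-5)
```
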
@@ -4795,7 +4767,7 @@ def teq_quantize(self, model, tune_cfg, dataloader, calib_func):
 
         supported_layers = ["Linear"]
         if folding:  # pragma: no cover
-            from .torch_utils.smooth_quant import GraphTrace
+            from neural_compressor.adaptor.torch_utils.waq import GraphTrace
 
             tg = GraphTrace()
             absorb_to_layer, _ = tg.get_absorb_to_layer(model, self.example_inputs, supported_layers)
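
Taken together, these hunks consolidate the SmoothQuant helpers under a single import path. Based on the diff, the affected symbols would now be imported as follows (the grouped form is just for illustration):

```python
# Import path taken from the diff; the grouping itself is illustrative.
from neural_compressor.adaptor.torch_utils.waq import (
    GraphTrace,
    TorchSmoothQuant,
    get_module,
    set_module,
)
```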