Added ALBERT v2 quantization with INC example (#1591)

* Add quantization to QA scripts * fix * Remove quantize bool field * Fix electra large accuracy * Update mkldnn to onednn * Accuracy fix * Add sphinx to dev requirements * remove print * change quantize_mode to proper one * fix round_to argument * Albert example Co-authored-by: Bartlomiej Gawrych <barlomiej.gawrych@intel.com> Co-authored-by: Bartlomiej Gawrych <bartlomiej.gawrych@intel.com>
dmlc · Dec 25, 2022 · 14553a0 · 14553a0
1 parent fecd3e1
commit 14553a0
Show file tree

Hide file tree

Showing 6 changed files with 1,103 additions and 14 deletions.
diff --git a/scripts/question_answering/albert_custom.yaml b/scripts/question_answering/albert_custom.yaml
@@ -0,0 +1,15 @@
+version: 1.0
+
+model:
+  name: albert_base_v2
+  framework: mxnet
+
+tuning:
+  strategy:
+    name: mycustom
+  accuracy_criterion:
+    relative:  0.02
+  exit_policy:
+    timeout: 0
+    max_trials: 1000
+  random_seed: 9527
diff --git a/scripts/question_answering/custom_strategy.py b/scripts/question_answering/custom_strategy.py
@@ -0,0 +1,176 @@
+import copy
+import numpy as np
+from collections import OrderedDict
+from neural_compressor.strategy.strategy import TuneStrategy, strategy_registry
+
+# When True, MyCustomTuneStrategy.next_tune_cfg saves a "before/after" relative error
+# plot to '<node_name>_error.png' for every node it decides to keep excluded from
+# quantization (matplotlib is imported lazily, only when a plot is produced).
+plot_operator_influence = True
+
+def calc_approx_error(expected_tensor: np.ndarray, observed_tensor: np.ndarray) -> float:
+    '''
+    Calculating relative error for one tensor
+
+    The result is mean(|observed - expected|) / mean(|expected|).
+
+    Parameters
+    ----------
+    expected_tensor : reference values
+    observed_tensor : values to be compared against the reference
+
+    Returns
+    -------
+    Relative error as a Python float. When the expected tensor is (numerically) all
+    zeros, the denominator is clamped to the float64 machine epsilon, so the result
+    stays finite instead of becoming nan/inf.
+    '''
+    mean_absolute_error = np.abs(observed_tensor - expected_tensor).mean()
+    mean_expected_value = np.abs(expected_tensor).mean()
+    # A zero denominator would produce nan (0/0) or inf (x/0); such values propagate
+    # into every sum or difference computed from the per-node errors afterwards.
+    denominator = max(mean_expected_value, np.finfo(np.float64).eps)
+    return float(mean_absolute_error / denominator)
+
+
+def get_approx_errors(expected_tensors, observed_tensors):
+    '''
+    Calculating relative error for multiple tensors
+
+    Both arguments are mappings keyed by node name; every entry is itself indexed
+    with the same node name again to reach the tensor (tensors[name][name]).
+    Only the nodes present in observed_tensors are evaluated.
+
+    Returns a dict: node name -> relative error computed by calc_approx_error.
+    '''
+    return {
+        name: calc_approx_error(expected_tensors[name][name], observed_tensors[name][name])
+        for name in observed_tensors
+    }
+
+
+@strategy_registry
+class MyCustomTuneStrategy(TuneStrategy):
+    '''
+    INC Custom strategy definition
+
+    next_tune_cfg proposes configurations in three stages:
+      1. every node quantized, preferring the 'minmax' calibration algorithm,
+      2. the same configuration with selected high-error nodes switched to fp32,
+      3. configurations generated with Bayesian optimization over the per-node
+         configuration indexes of the nodes that stayed quantized.
+    '''
+    def __init__(self, model, conf, q_dataloader, q_func=None,
+                 eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None):
+        # All arguments are forwarded unchanged to TuneStrategy
+        super().__init__(
+            model,
+            conf,
+            q_dataloader,
+            q_func,
+            eval_dataloader,
+            eval_func,
+            dicts,
+            q_hooks)
+
+
+    def get_qtensors(self, quant_cfg, node_list):
+        '''
+        Generating quantized model based on configuration and capturing intermediate tensors
+
+        quant_cfg - tuning configuration passed to the adaptor's quantize method
+        node_list - names of the nodes whose tensors should be captured
+
+        Returns the 'activation' tensors (layer outputs) of the inspected iteration.
+        '''
+        qmodel = self.adaptor.quantize(quant_cfg, self.model, self.calib_dataloader)
+        # [1] is the list of inspected iterations (original author's note: 1 is a batch index)
+        tensors = self.adaptor.inspect_tensor(qmodel, self.calib_dataloader, node_list, [1])
+        # We need to specify that we want activation (layer output) because INC stores also
+        # weight tensors; [0] selects the first (and only) captured batch
+        return tensors['activation'][0]
+
+    def next_tune_cfg(self):
+        '''
+        Generator of tuning configurations (see the class description for the stages).
+
+        The very same best_cfg dictionary is yielded twice: first fully quantized, then
+        with selected nodes excluded from quantization (it is modified in place between
+        the two yields). Afterwards configurations from bayesian_configurations follow.
+        '''
+        FALLBACK_DTYPE = 'fp32'
+        # A node becomes a candidate for exclusion when its relative error exceeds this value
+        NODE_ERROR_THRESHOLD = 0.06
+        # Minimal mean (over all compared nodes) error reduction needed to keep a node excluded
+        MEAN_IMPROVEMENT_THRESHOLD = 0.01
+
+        # creating base configuration - all nodes are quantized and calibrated with minmax algorithm
+        best_cfg = {}
+        best_cfg['calib_iteration'] = int(self.calib_iter[0]) # number of batches for calibration
+        best_cfg['calib_sampling_size'] = int(self.calib_sampling_size[0]) # number of samples for calibration (multiplicity of batch)
+        nodes_cfg = OrderedDict()
+        nodes_cfg_idx = {}
+        for node_key, cfgs in self.opwise_tune_cfgs.items():
+            if not cfgs:
+                # NOTE(review): a node without any configuration is skipped; previously the
+                # configuration of the preceding node leaked into it (or NameError was raised)
+                continue
+            # If no 'minmax' configuration exists, the last listed one is used and the node
+            # is not registered in nodes_cfg_idx (it will not take part in stage 3)
+            chosen_cfg = cfgs[-1]
+            for i, cfg in enumerate(cfgs):
+                # .get(): entries without a calibration algorithm are simply not a match
+                if cfg['activation'].get('algorithm') == 'minmax':
+                    nodes_cfg_idx[node_key] = i
+                    chosen_cfg = cfg
+                    break
+            # Copy, because the dtype of this configuration is modified below and the
+            # strategy's own opwise_tune_cfgs must stay intact
+            nodes_cfg[node_key] = copy.deepcopy(chosen_cfg)
+        best_cfg['op'] = nodes_cfg
+
+        yield best_cfg
+
+        # If fully quantized model does not meet the requirements, we proceed to exclude some nodes
+
+        # Collecting tensors from the original model - expected tensors
+        node_list = [op_name for op_name, _ in best_cfg['op']]
+        f32_tensors = self.adaptor.inspect_tensor(self.model, self.calib_dataloader, node_list, [1])
+        f32_tensors = f32_tensors['activation'][0]
+
+        # Collecting tensors from the fully quantized model
+        q_tensors = self.get_qtensors(best_cfg, node_list)
+        approx_errors = get_approx_errors(f32_tensors, q_tensors)
+
+        # best_cfg['op'] is an OrderedDict, which order of elements should correspond to their
+        # order in the computational graph
+        for node_key, cfg in best_cfg['op'].items():
+            # Node's key in INC is its name + its operator
+            node_name, _ = node_key
+            # Checking what configuration options are available for this particular node
+            capabilities = self.opwise_tune_space[node_key]['activation']['dtype']
+            # Only nodes which can be excluded from quantization ('fp32' in capabilities)
+            # and whose current error is bigger than the threshold value are examined
+            if FALLBACK_DTYPE not in capabilities:
+                continue
+            if not approx_errors[node_name] > NODE_ERROR_THRESHOLD:
+                continue
+
+            # We check what accuracy improvement would be achieved by this exclusion
+            original_dtype = cfg['activation']['dtype']
+            cfg['activation']['dtype'] = FALLBACK_DTYPE # Exclude the node from quantization
+
+            # Collecting tensors for a new configuration with the current node excluded
+            q_tensors = self.get_qtensors(best_cfg, node_list)
+            # Calculating errors for the new configuration
+            new_approx_errors = get_approx_errors(f32_tensors, q_tensors)
+            # Calculating error differences for every node in a model
+            err_diffs_arr = np.array([approx_errors[name] - new_error
+                                      for name, new_error in new_approx_errors.items()])
+
+            # If the sum of error reductions over all layers is at least the threshold value
+            # (threshold per layer times the number of layers) we keep the node excluded
+            threshold_sum_error_layers = err_diffs_arr.size * MEAN_IMPROVEMENT_THRESHOLD
+            if err_diffs_arr.sum() >= threshold_sum_error_layers:
+                if plot_operator_influence:
+                    import matplotlib.pyplot as plt
+                    before = approx_errors
+                    after = approx_errors.copy()
+                    after.update(new_approx_errors)
+                    fig = plt.figure()
+                    # Lists instead of dict views - plot expects array-like data
+                    plt.plot(list(before.values()), marker='o', markersize=2.5, label='Before')
+                    plt.plot(list(after.values()), marker='o', markersize=2.5, label='After')
+                    plt.ylabel('Relative error')
+                    plt.xlabel('Layer')
+                    plt.legend()
+                    plt.savefig(f'{node_name}_error.png')
+                    # One figure is created per excluded node - release it after saving
+                    plt.close(fig)
+
+                approx_errors.update(new_approx_errors)
+                # Mark node as not quantizable; the node may be absent when it had no
+                # 'minmax' configuration, hence the default value
+                nodes_cfg_idx.pop(node_key, None)
+            else:
+                cfg['activation']['dtype'] = original_dtype
+
+        yield best_cfg
+
+        # Choosing calibration algorithm (kl or minmax) for every node which was not excluded from quantization
+        yield from self.bayesian_configurations(best_cfg, nodes_cfg_idx)
+
+    def bayesian_params_to_tune_configs(self, params):
+        '''
+        Creating configuration from params - changing configurations' indexes for real configurations
+
+        params - mapping: node key -> (possibly fractional) index of a configuration
+
+        Returns a dict: node key -> deep copy of the selected entry of
+        self.opwise_quant_cfgs[node key]. The index is truncated with int() and limited
+        to the last valid position; nodes absent from params are omitted.
+        '''
+        return {
+            node_key: copy.deepcopy(configs[min(int(params[node_key]), len(configs) - 1)])
+            for node_key, configs in self.opwise_quant_cfgs.items()
+            if node_key in params
+        }
+
+    def bayesian_configurations(self, cfg_base, params_base):
+        '''
+        Endless generator of configurations proposed by Bayesian optimization.
+
+        cfg_base    - configuration used as the starting point (it is copied, not modified)
+        params_base - mapping: node key -> index of the configuration used in cfg_base
+
+        Nothing is generated when no node has more than one configuration to choose from.
+        The same (updated in place) configuration object is yielded on every iteration.
+        '''
+        from neural_compressor.strategy.bayesian import BayesianOptimization
+
+        # For each node we specify the possible range of values (we treat them as a configurations' index)
+        pbounds = {
+            node_key: (0, len(configs))
+            for node_key, configs in self.opwise_quant_cfgs.items()
+            if node_key in params_base and len(configs) > 1
+        }
+        if not pbounds: # if there is nothing to be optimized, we finish
+            return
+
+        cfg = copy.deepcopy(cfg_base)
+        bayes_opt = BayesianOptimization(pbounds=pbounds, random_seed=self.cfg.tuning.random_seed)
+        # Registering the outcome of current configuration. Only the optimized nodes are
+        # passed: params_base may also contain nodes with a single configuration, which
+        # are not dimensions of the search space.
+        # NOTE(review): assumes the target space requires exactly the pbounds keys - confirm
+        start_params = {node_key: params_base[node_key] for node_key in pbounds}
+        bayes_opt._space.register(start_params, self.last_tune_result[0])
+        while True:
+            # Generating next configuration
+            params = bayes_opt.gen_next_params()
+            cfg['op'].update(self.bayesian_params_to_tune_configs(params))
+            yield cfg
+            try:
+                # Registering the outcome
+                bayes_opt._space.register(params, self.last_tune_result[0])
+            except KeyError:
+                # Deliberately ignored - presumably raised for a point which has already
+                # been registered (TODO confirm); the search simply continues
+                pass
diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py
@@ -180,6 +180,7 @@ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
         self.answerable_scores.add(nn.Dense(2, flatten=False,
                                             weight_initializer=weight_initializer,
                                             bias_initializer=bias_initializer))
+        self.quantized_backbone = None
 
     def get_start_logits(self, contextual_embedding, p_mask):
         """
@@ -287,10 +288,14 @@ def forward(self, tokens, token_types, valid_length, p_mask, start_position):
             Shape (batch_size, sequence_length)
         answerable_logits
         """
+        backbone_net = self.backbone
+        if self.quantized_backbone != None:
+           backbone_net = self.quantized_backbone
+
         if self.use_segmentation:
-            contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+            contextual_embeddings = backbone_net(tokens, token_types, valid_length)
         else:
-            contextual_embeddings = self.backbone(tokens, valid_length)
+            contextual_embeddings = backbone_net(tokens, valid_length)
         start_logits = self.get_start_logits(contextual_embeddings, p_mask)
         end_logits = self.get_end_logits(contextual_embeddings,
                                          np.expand_dims(start_position, axis=1),
@@ -337,11 +342,16 @@ def inference(self, tokens, token_types, valid_length, p_mask,
             The answerable logits. Here 0 --> answerable and 1 --> not answerable.
             Shape (batch_size, sequence_length, 2)
         """
+        backbone_net = self.backbone
+        if self.quantized_backbone != None:
+           backbone_net = self.quantized_backbone
+
         # Shape (batch_size, sequence_length, C)
         if self.use_segmentation:
-            contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+            contextual_embeddings = backbone_net(tokens, token_types, valid_length)
         else:
-            contextual_embeddings = self.backbone(tokens, valid_length)
+            contextual_embeddings = backbone_net(tokens, valid_length)
+
         start_logits = self.get_start_logits(contextual_embeddings, p_mask)
         # The shape of start_top_index will be (..., start_top_n)
         start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,