From 631e1ce3a7506d02e4b683f75e28848bda2a69d7 Mon Sep 17 00:00:00 2001
From: wenhuach21
Date: Tue, 22 Oct 2024 10:04:45 +0800
Subject: [PATCH 1/2] avoid deterministic algorithm warning in inference

---
 auto_round/autoround.py     |  7 +++----
 auto_round/calib_dataset.py | 38 ++++++++++++++++++++------------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 256aba47..7ed64d1b 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -15,9 +15,6 @@
 import os
 import torch
 import transformers
-
-torch.use_deterministic_algorithms(True, warn_only=True)
-
 import copy
 import time
 from typing import Optional, Union
@@ -25,7 +22,7 @@
 from transformers import set_seed
 from torch import autocast
 from tqdm import tqdm
-from .calib_dataset import get_dataloader
+
 from .quantizer import WrapperMultiblock, wrapper_block, unwrapper_block, WrapperLinear, unwrapper_layer, \
     WrapperTransformerConv1d
 from .special_model_handler import (check_hidden_state_dim,
@@ -488,8 +485,10 @@ def calib(self, nsamples, bs):
             nsamples (int): The number of samples to use for calibration.
             bs (int): The batch size to use for calibration.
         """
+        from .calib_dataset import get_dataloader
         if isinstance(self.dataset, str):
             dataset = self.dataset.replace(" ", "")  # remove all whitespace
+            # note: building the calibration dataloader below is slow
             self.dataloader = get_dataloader(
diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py
index 4c01302d..82dfa66c 100644
--- a/auto_round/calib_dataset.py
+++ b/auto_round/calib_dataset.py
@@ -16,6 +16,8 @@
 import random
 
 import torch
+
+torch.use_deterministic_algorithms(True, warn_only=True)
 from torch.utils.data import DataLoader
 
 from .utils import is_local_path, logger
@@ -58,7 +60,7 @@ def default_tokenizer_function(examples, apply_template=apply_template):
         if not apply_template:
             example = tokenizer(examples["text"], truncation=True, max_length=seqlen)
         else:
-            from jinja2 import Template # pylint: disable=E0401
+            from jinja2 import Template  # pylint: disable=E0401
             chat_template = tokenizer.chat_template if tokenizer.chat_template is not None \
                 else tokenizer.default_chat_template
             template = Template(chat_template)
@@ -66,7 +68,7 @@ def default_tokenizer_function(examples, apply_template=apply_template):
             for text in examples["text"]:
                 message = [{"role": "user", "content": text}]
                 rendered_message = template.render(messages=message, add_generation_prompt=True, \
-                    bos_token=tokenizer.bos_token)
+                                                   bos_token=tokenizer.bos_token)
                 rendered_messages.append(rendered_message)
             example = tokenizer(rendered_messages, truncation=True, max_length=seqlen)
         return example
@@ -103,11 +105,11 @@ def get_pile_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", split
 
 @register_dataset("madao33/new-title-chinese")
 def get_new_chinese_title_dataset(
-    tokenizer,
-    seqlen,
-    dataset_name="madao33/new-title-chinese",
-    split=None,
-    seed=42,
+        tokenizer,
+        seqlen,
+        dataset_name="madao33/new-title-chinese",
+        split=None,
+        seed=42,
     apply_template=False
 ):
     """Returns a dataloader for the specified dataset and split.
@@ -148,7 +150,7 @@ def default_tokenizer_function(examples, apply_template=apply_template): for text in examples["text"]: message = [{"role": "user", "content": text}] rendered_message = template.render(messages=message, add_generation_prompt=True, \ - bos_token=tokenizer.bos_token) + bos_token=tokenizer.bos_token) rendered_messages.append(rendered_message) example = tokenizer(rendered_messages, truncation=True, max_length=seqlen) return example @@ -267,12 +269,12 @@ def load_local_data(data_path): def get_dataloader( - tokenizer, - seqlen, - dataset_name="NeelNanda/pile-10k", - seed=42, - bs=8, - nsamples=512, + tokenizer, + seqlen, + dataset_name="NeelNanda/pile-10k", + seed=42, + bs=8, + nsamples=512, ): """Generate a DataLoader for calibration using specified parameters. @@ -293,6 +295,7 @@ def get_dataloader( """ dataset_names = dataset_name.split(",") + def filter_func(example): if isinstance(example["input_ids"], list): example["input_ids"] = torch.tensor(example["input_ids"]) @@ -316,7 +319,7 @@ def concat_dataset_element(dataset): input_id = input_id[1:] os_cnt, have_bos = os_cnt + 1, True if input_id[-1] == eos_token_id: - input_id = input_id[:-1] + input_id = input_id[:-1] os_cnt, have_eos = os_cnt + 1, True if buffer_input_id.shape[-1] + input_id.shape[-1] + os_cnt > seqlen: @@ -326,7 +329,7 @@ def concat_dataset_element(dataset): input_id_to_append = [torch.tensor([bos_token_id])] + input_id_to_append if have_eos: input_id_to_append.append(torch.tensor([eos_token_id])) - + concat_input_ids.append(torch.cat(input_id_to_append).to(torch.int64)) attention_mask_list.append(attention_mask) buffer_input_id = input_id[idx_keep:] @@ -405,7 +408,7 @@ def concat_dataset_element(dataset): name = dataset_names[i].split(':')[0] if name not in data_lens: target_cnt = (nsamples - cnt) // (len(datasets) - len(data_lens)) if data_lens \ - else (nsamples - cnt) // (len(datasets) - i) + else (nsamples - cnt) // (len(datasets) - i) target_cnt = min(target_cnt, len(datasets[i])) cnt += target_cnt else: @@ -447,4 +450,3 @@ def collate_batch(batch): calib_dataloader = DataLoader(dataset_final, batch_size=bs, shuffle=False, collate_fn=collate_batch) return calib_dataloader - From d769ca8e99d9e0410bbaef4b35986909fd69e5b9 Mon Sep 17 00:00:00 2001 From: wenhuach21 Date: Tue, 22 Oct 2024 10:18:20 +0800 Subject: [PATCH 2/2] update sym model --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fbf5108f..58d3c52c 100644 --- a/README.md +++ b/README.md @@ -272,8 +272,8 @@ release most of the models ourselves. 
 | meta-llama/Meta-Llama-3.1-8B-Instruct | [model-kaitchup-autogptq-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-Instruct-autoround-gptq-4bit-asym), [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-Instruct-autoround-gptq-4bit-sym), [recipe](https://huggingface.co/Intel/Meta-Llama-3.1-8B-Instruct-int4-inc) |
 | meta-llama/Meta-Llama-3.1-8B | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Meta-Llama-3.1-8B-autoround-gptq-4bit-sym) |
 | Qwen/Qwen-VL | [accuracy](./examples/multimodal-modeling/Qwen-VL/README.md), [recipe](./examples/multimodal-modeling/Qwen-VL/run_autoround.sh)
-| Qwen/Qwen2-7B | [model-autoround-int4](https://huggingface.co/Intel/Qwen2-7B-int4-inc) |
-| Qwen/Qwen2-57B-A14B-Instruct | [model-autoround-int4](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc) |
+| Qwen/Qwen2-7B | [model-autoround-sym-int4](https://huggingface.co/Intel/Qwen2-7B-int4-inc), [model-autogptq-sym-int4](https://huggingface.co/Intel/Qwen2-7B-int4-inc) |
+| Qwen/Qwen2-57B-A14B-Instruct | [model-autoround-sym-int4](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc), [model-autogptq-sym-int4](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc) |
 | 01-ai/Yi-1.5-9B | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-4bit-gptq-autoround) |
 | 01-ai/Yi-1.5-9B-Chat | [model-LnL-AI-autogptq-int4*](https://huggingface.co/LnL-AI/Yi-1.5-9B-Chat-4bit-gptq-autoround) |
 | Intel/neural-chat-7b-v3-3 | [model-autogptq-int4](https://huggingface.co/Intel/neural-chat-7b-v3-3-int4-inc) |
@@ -283,7 +283,7 @@
 | google/gemma-2b | [model-autogptq-int4](https://huggingface.co/Intel/gemma-2b-int4-inc) |
 | tiiuae/falcon-7b | [model-autogptq-int4-G64](https://huggingface.co/Intel/falcon-7b-int4-inc) |
 | sapienzanlp/modello-italia-9b | [model-fbaldassarri-autogptq-int4*](https://huggingface.co/fbaldassarri/modello-italia-9b-autoround-w4g128-cpu) |
-| microsoft/phi-2 | [model-autogptq-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc) |
+| microsoft/phi-2 | [model-autoround-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc), [model-autogptq-sym-int4](https://huggingface.co/Intel/phi-2-int4-inc) |
 | microsoft/Phi-3.5-mini-instruct | [model-kaitchup-autogptq-sym-int4*](https://huggingface.co/kaitchup/Phi-3.5-Mini-instruct-AutoRound-4bit) |
 | microsoft/Phi-3-vision-128k-instruct | [recipe](./examples/multimodal-modeling/Phi-3-vision/run_autoround.sh)
 | mistralai/Mistral-7B-Instruct-v0.2 | [accuracy](./docs/Mistral-7B-Instruct-v0.2-acc.md), [recipe](./examples/language-modeling/scripts/Mistral-7B-Instruct-v0.2.sh), [example](./examples/language-modeling/) |
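
Patch 1/2 relies on Python's import-time execution semantics: the module-level
torch.use_deterministic_algorithms(True, warn_only=True) call moves out of
autoround.py into calib_dataset.py, and autoround.py now imports
get_dataloader inside calib() rather than at module scope. The switch, and any
deterministic-algorithm warnings it triggers, therefore fires only once
calibration actually starts; importing autoround for inference alone executes
nothing. Below is a minimal sketch of this deferred-import pattern; the two
stand-in modules mirror the patch's names but are simplified, hypothetical
versions, not the real auto_round sources.

# --- calib_dataset.py (simplified stand-in) -------------------------------
import torch

# Module-level side effect: runs exactly once, at the first import of this
# module. warn_only=True downgrades "no deterministic implementation" errors
# on affected ops to warnings instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)


def get_dataloader():
    """Stand-in for the real calibration dataloader factory."""
    return []


# --- autoround.py (simplified stand-in) -----------------------------------
# No top-level `from calib_dataset import get_dataloader` anymore, so merely
# importing this module (the inference-only path) flips no torch globals.

def calib():
    # Deferred import: calib_dataset's module-level call executes here, the
    # first time calibration is requested, never on inference-only runs.
    from calib_dataset import get_dataloader
    return get_dataloader()

The usual trade-off of a function-local import applies: side effects and
import cost are deferred to first use, at the price of hiding the dependency
from the module header.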