From 5df30e1e28de487db565d7fdc69b15ad7bf25fd1 Mon Sep 17 00:00:00 2001
From: Bhuvanesh Sridharan
Date: Thu, 19 Sep 2024 16:24:00 +0530
Subject: [PATCH 1/2] fix: fixing use of custom calibration dataset for
 smoothquant in llama

---
 examples/llama/convert_checkpoint.py | 14 ++++++++++++++
 tensorrt_llm/models/convert_utils.py |  4 ++--
 tensorrt_llm/models/llama/convert.py | 10 ++++++++--
 tensorrt_llm/models/llama/model.py   |  4 +++-
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/examples/llama/convert_checkpoint.py b/examples/llama/convert_checkpoint.py
index 820db5b2d..f97edcb04 100644
--- a/examples/llama/convert_checkpoint.py
+++ b/examples/llama/convert_checkpoint.py
@@ -91,6 +91,18 @@ def parse_arguments():
         help=
         "The huggingface dataset name or the local directory of the dataset for calibration."
     )
+    parser.add_argument(
+        "--calib_size",
+        type=int,
+        default=512,
+        help="Number of datapoints to use for calibration. Default is 512. Set to -1 to use the whole dataset.",
+    )
+    parser.add_argument(
+        "--calib_max_seq_length",
+        type=int,
+        default=512,
+        help="Max sequence length to use for calibration. Default is 512.",
+    )
     parser.add_argument(
         "--smoothquant",
         "-sq",
@@ -402,6 +414,8 @@ def convert_and_save_hf(args):
                                   quant_config=quant_config,
                                   device='cpu' if args.load_model_on_cpu else 'cuda',
                                   calib_dataset=args.calib_dataset,
+                                  calib_batches=args.calib_size,
+                                  calib_max_seq_length=args.calib_max_seq_length,
                                   **override_fields)
     else:
         # When not loading by shard, preload one complete model and then slice per rank weights from this
diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py
index be587fd5f..c81518178 100644
--- a/tensorrt_llm/models/convert_utils.py
+++ b/tensorrt_llm/models/convert_utils.py
@@ -254,8 +254,8 @@ def has_safetensors(model_dir: str):
 
 def load_calib_dataset(dataset_name_or_dir: str,
                        config_name: Optional[str] = None,
-                       split: Optional[str] = None,
-                       key: Optional[str] = None,
+                       split: Optional[str] = "train",  # default split value in hf datasets object
+                       key: Optional[str] = "text",  # default key value in hf datasets object
                        trust_remote_code=True,
                        **kwargs):
     if config_name is None:
diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py
index a1f3b7c38..7788a55c9 100644
--- a/tensorrt_llm/models/llama/convert.py
+++ b/tensorrt_llm/models/llama/convert.py
@@ -1084,7 +1084,10 @@ def quantize(hf_model_dir: str,
              output_dir: str,
              config: LLaMAConfig,
              device: str = 'cuda',
-             calib_dataset: str = 'cnn_dailymail'):
+             calib_dataset: str = 'cnn_dailymail',
+             calib_batches: int = 512,
+             calib_max_seq_length: int = 512,
+             ):
     '''
         Quantize the save the model as TRT-LLM checkpoint to output_dir
     '''
@@ -1118,7 +1121,10 @@
 
     dataset = load_calib_dataset(calib_dataset)
 
-    act_range = capture_activation_range(hf_model, tokenizer, dataset)
+    if calib_batches == -1:  # use the whole dataset if calib_batches is -1
+        calib_batches = len(dataset)
+
+    act_range = capture_activation_range(hf_model, tokenizer, dataset, num_samples=calib_batches, seq_len=calib_max_seq_length)
     qkv_para, smoother = {}, {}
     if use_smooth_quant:
         smooth_llama_model(hf_model, act_range, quant_config.smoothquant_val,
diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py
index 93111df4c..352bc33d7 100644
--- a/tensorrt_llm/models/llama/model.py
+++ b/tensorrt_llm/models/llama/model.py
@@ -446,7 +446,9 @@ def quantize(
                      output_dir,
                      config=config,
                      device=device,
-                     calib_dataset=calib_dataset)
+                     calib_dataset=calib_dataset,
+                     calib_batches=calib_batches,
+                     calib_max_seq_length=calib_max_seq_length,)
         else:
             raise ValueError(
                 f"The quant_config ({quant_config}) does not require calibration, try {cls.__name__}.from_hugging_face instead."

From 67aa530ae5971467557dff40ff0409efda78b0c5 Mon Sep 17 00:00:00 2001
From: Bhuvanesh Sridharan
Date: Sat, 28 Sep 2024 11:48:28 +0530
Subject: [PATCH 2/2] fix: Correctly setting default split and key without
 overriding the values in DEFAULT_HF_DATASET_META

---
 tensorrt_llm/models/convert_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py
index c81518178..980ac32ab 100644
--- a/tensorrt_llm/models/convert_utils.py
+++ b/tensorrt_llm/models/convert_utils.py
@@ -249,13 +249,14 @@ def has_safetensors(model_dir: str):
     'ccdv/cnn_dailymail': ('3.0.0', 'train', 'article'),
     'cnn_dailymail': ('3.0.0', 'train', 'article'),
     'lambada': (None, 'validation', 'text'),
+    '': (None, 'train', 'text'),
 }
 
 
 def load_calib_dataset(dataset_name_or_dir: str,
                        config_name: Optional[str] = None,
-                       split: Optional[str] = "train",  # default split value in hf datasets object
-                       key: Optional[str] = "text",  # default key value in hf datasets object
+                       split: Optional[str] = None,  # default split value will be 'train' in hf datasets object
+                       key: Optional[str] = None,  # default key value will be 'text' in hf datasets object
                        trust_remote_code=True,
                        **kwargs):
     if config_name is None:
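
For reference, a minimal sketch of the calibration-sizing behaviour PATCH 1/2
introduces. This is illustrative Python, not TensorRT-LLM code: quantize_sketch
and its list-based dataset are hypothetical stand-ins; only the parameter names
(calib_batches, calib_max_seq_length) and the -1 sentinel come from the diff.

# Illustrative sketch only: quantize_sketch and the list-based "dataset" are
# hypothetical stand-ins, not TensorRT-LLM APIs.
def quantize_sketch(calib_batches: int = 512,
                    calib_max_seq_length: int = 512) -> int:
    dataset = list(range(1000))  # stand-in for load_calib_dataset(...)

    # Sentinel from the patch: -1 means "calibrate on the whole dataset".
    if calib_batches == -1:
        calib_batches = len(dataset)

    # Stand-in for capture_activation_range(..., num_samples=calib_batches,
    # seq_len=calib_max_seq_length): only the first calib_batches samples are
    # used, each truncated to calib_max_seq_length tokens.
    return len(dataset[:calib_batches])

assert quantize_sketch(calib_batches=-1) == 1000  # whole dataset
assert quantize_sketch(calib_batches=256) == 256  # capped at --calib_size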
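
Similarly, a sketch of the split/key resolution that PATCH 2/2 restores.
resolve_dataset_meta is a hypothetical helper (the real logic lives inside
load_calib_dataset and may differ); the table rows are copied from the diff,
and the rule that defaults must not override per-dataset metadata follows the
commit message.

from typing import Optional, Tuple

# Hypothetical helper mirroring the lookup in load_calib_dataset; the '' row
# acts as the fallback for datasets not listed, e.g. a local custom dataset.
DEFAULT_HF_DATASET_META = {
    'ccdv/cnn_dailymail': ('3.0.0', 'train', 'article'),
    'cnn_dailymail': ('3.0.0', 'train', 'article'),
    'lambada': (None, 'validation', 'text'),
    '': (None, 'train', 'text'),
}

def resolve_dataset_meta(name: str,
                         config_name: Optional[str] = None,
                         split: Optional[str] = None,
                         key: Optional[str] = None) -> Tuple:
    # Known datasets keep their own row (lambada stays on its 'validation'
    # split); anything else falls back to 'train'/'text' via the '' row.
    meta = DEFAULT_HF_DATASET_META.get(name, DEFAULT_HF_DATASET_META[''])
    return (config_name if config_name is not None else meta[0],
            split if split is not None else meta[1],
            key if key is not None else meta[2])

assert resolve_dataset_meta('lambada') == (None, 'validation', 'text')
assert resolve_dataset_meta('./my_local_dataset') == (None, 'train', 'text')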