Fix P-tuning for Llama-based models (#9300)
* Fix P-tuning for Llama-based models (#9297)

* Added the BOS token for Llama, Mistral and Mixtral.

Signed-off-by: Alexey Panteleev <alpanteleev@nvidia.com>

* Don't load an existing TRT-LLM model before export, to speed up the export process and avoid possible contamination from previous runs.

Signed-off-by: Alexey Panteleev <alpanteleev@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: apanteleev <apanteleev@users.noreply.github.com>

---------

Signed-off-by: Alexey Panteleev <alpanteleev@nvidia.com>
Signed-off-by: apanteleev <apanteleev@users.noreply.github.com>
Co-authored-by: apanteleev <apanteleev@users.noreply.github.com>
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>

* Fix the export test

---------

Signed-off-by: Alexey Panteleev <alpanteleev@nvidia.com>
Signed-off-by: apanteleev <apanteleev@users.noreply.github.com>
Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: Alexey Panteleev <alpanteleev@nvidia.com>
Co-authored-by: apanteleev <apanteleev@users.noreply.github.com>
Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
4 people authored May 30, 2024
1 parent 2e39606 commit b6595cb
Showing 4 changed files with 10 additions and 3 deletions.
8 changes: 7 additions & 1 deletion nemo/export/trt_llm/tensorrt_llm_run.py
@@ -312,7 +312,13 @@ def load(
 
     max_batch_size = config["build_config"]["max_batch_size"]
     max_input_len = config["build_config"]["max_input_len"]
-    add_bos = True if config["pretrained_config"]["architecture"] == "GemmaForCausalLM" else False
+    architectures_that_need_bos_token = [
+        "GemmaForCausalLM",
+        "LLaMAForCausalLM",
+        "MistralForCausalLM",
+        "MixtralForCausalLM",
+    ]
+    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
 
     return TensorrtLLMHostContext(
         executor=executor,
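This hunk replaces the Gemma-only ternary with a list of architectures whose tokenizers expect a leading BOS token, so Llama, Mistral, and Mixtral prompts are handled the same way during p-tuned inference. A minimal sketch of how an add_bos flag like this is typically applied when encoding a prompt (illustrative only; tokenize_prompt is a hypothetical helper, not NeMo's actual tokenization code, and it assumes a tokenizer exposing encode() and bos_token_id):

def tokenize_prompt(tokenizer, prompt, add_bos):
    # Encode the raw prompt, then prepend BOS exactly once when the target
    # architecture (e.g. Gemma, Llama, Mistral, Mixtral) expects it.
    ids = tokenizer.encode(prompt)
    if add_bos and (not ids or ids[0] != tokenizer.bos_token_id):
        ids = [tokenizer.bos_token_id] + ids
    return ids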
1 change: 1 addition & 0 deletions scripts/deploy/nlp/deploy_triton.py
@@ -216,6 +216,7 @@ def nemo_deploy(argv):
         trt_llm_exporter = TensorRTLLM(
             model_dir=trt_llm_path,
             lora_ckpt_list=args.lora_ckpt,
+            load_model=(args.nemo_checkpoint is None),
             use_python_runtime=(not args.use_cpp_runtime),
         )
 
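The added load_model=(args.nemo_checkpoint is None) argument means the deploy script only loads a pre-built engine from the model repository when no .nemo checkpoint is supplied; when a checkpoint is given, the subsequent export rebuilds the engine anyway, so loading it first would only slow things down and risk carrying state over from a previous run. A hedged sketch of what a constructor flag like this typically gates (ExporterSketch and _load_existing_engine are hypothetical stand-ins, not the real TensorRTLLM class):

import os

class ExporterSketch:
    """Hypothetical stand-in illustrating a load_model-style constructor flag."""

    def __init__(self, model_dir, load_model=True):
        self.model_dir = model_dir
        self.engine = None
        if load_model and os.path.isdir(model_dir):
            # Reuse whatever engine already sits in model_dir; skipped when a
            # fresh export is about to replace it.
            self.engine = self._load_existing_engine(model_dir)

    def _load_existing_engine(self, model_dir):
        # Placeholder: the real exporter would deserialize a TRT-LLM engine here.
        return {"engine_dir": model_dir}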
2 changes: 1 addition & 1 deletion scripts/export/export_to_trt_llm.py
@@ -131,7 +131,7 @@ def nemo_export_trt_llm(argv):
         return
 
     try:
-        trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository)
+        trt_llm_exporter = TensorRTLLM(model_dir=args.model_repository, load_model=False)
 
         LOGGER.info("Export to TensorRT-LLM function is called.")
         trt_llm_exporter.export(
2 changes: 1 addition & 1 deletion tests/export/test_nemo_export.py
@@ -200,7 +200,7 @@ def run_trt_llm_inference(
         print("---- LoRA could not be enabled and skipping the test.")
         return None, None, None, None, None
 
-    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list)
+    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False)
 
    trt_llm_exporter.export(
        nemo_checkpoint_path=checkpoint_path,
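With load_model=False, the test builds its exporter without touching any engine left over from an earlier run. A usage sketch based only on the calls visible in this diff (the import path and file paths are assumptions, and export() may require additional arguments such as the model type depending on the NeMo version):

from nemo.export import TensorRTLLM  # assumed import path

trt_llm_exporter = TensorRTLLM(
    model_dir="/tmp/trt_llm_model_dir",  # placeholder engine directory
    lora_ckpt_list=None,
    load_model=False,  # skip loading any stale engine before a fresh export
)
trt_llm_exporter.export(
    nemo_checkpoint_path="/models/example.nemo",  # placeholder checkpoint path
)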
