TRT-LLM 0.10 Update (NVIDIA#9402)
* reorg the export code

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* replaced log with raise

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* add converter and loader folders

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* move nemo_ckpt_convert into the converter folder

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* move nemo_file into loader folder

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* reorg converter

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* continue to reorg converter

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* continue to reorg

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* move nemo file back into nemo folder

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* renamed nemo folder to nemo_ckpt_loader

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* remove unused function

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* removed nemo file

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* moved a function to tensorrt_llm_run file

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* Remove unused imports

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* add csv import

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* update the APIs

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* add use_embedding_sharing param

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* do not add unused inputs during MG export

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* add cpp runtime test

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>

* sharing embedding

* Remove manual scaling

* renaming to avoid nemo github issue

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>

---------

Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>
Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: Bobby Chen <bobchen@nvidia.com>
3 people authored and JesusPaz committed Jun 18, 2024
1 parent 5fe2e14 commit a5d7c85
Showing 6 changed files with 106 additions and 46 deletions.
10 changes: 9 additions & 1 deletion nemo/export/tensorrt_llm.py
@@ -121,13 +121,15 @@ def export(
n_gpus: int = 1,
tensor_parallel_size: int = None,
pipeline_parallel_size: int = None,
gpus_per_node: int = None,
max_input_len: int = 256,
max_output_len: int = 256,
max_input_token: Optional[int] = None,
max_output_token: Optional[int] = None,
max_batch_size: int = 8,
max_prompt_embedding_table_size=None,
use_parallel_embedding: bool = False,
use_embedding_sharing: bool = False,
paged_kv_cache: bool = True,
remove_input_padding: bool = True,
dtype: str = "bfloat16",
@@ -150,13 +152,15 @@ def export(
n_gpus (int): number of GPUs to use for inference.
tensor_parallel_size (int): tensor parallelism.
pipeline_parallel_size (int): pipeline parallelism.
gpus_per_node (int): number of GPUs per node.
max_input_len (int): max input length.
max_output_len (int): max output length.
max_input_token (int): max input length. Deprecated, use max_input_len instead.
max_output_token (int): max output length. Deprecated, use max_output_len instead.
max_batch_size (int): max batch size.
max_prompt_embedding_table_size (int): max prompt embedding size.
use_parallel_embedding (bool): whether to use the parallel embedding feature of TRT-LLM.
use_embedding_sharing (bool): whether to share the embedding table between the vocab embedding and the lm_head (output) layer.
paged_kv_cache (bool): if True, uses the paged KV cache feature of TensorRT-LLM.
remove_input_padding (bool): whether to remove input padding.
dtype (str): Floating point type for model weights (Supports BFloat16/Float16).
@@ -173,7 +177,7 @@ def export(
if model_type not in self.get_supported_models_list:
raise Exception(
"Model {0} is not currently a supported model type. "
"Supported model types are llama, gptnext, falcon, and starcoder".format(model_type)
"Supported model types are llama, gptnext, falcon, and starcoder.".format(model_type)
)

if model_type == "gpt" or model_type == "starcoder":
@@ -189,6 +193,8 @@ def export(
tensor_parallel_size = 1
pipeline_parallel_size = n_gpus

gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node

if Path(self.model_dir).exists():
if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
for files in os.listdir(self.model_dir):
@@ -267,7 +273,9 @@ def export(
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
gpus_per_node=gpus_per_node,
use_parallel_embedding=use_parallel_embedding,
use_embedding_sharing=use_embedding_sharing,
)

for weight_dict, model_config in zip(weights_dicts, model_configs):
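Taken together, the updated export API can be driven as in the sketch below. This is illustrative, not code from this commit: the import path follows the module edited above, while the checkpoint path, engine directory, and the keyword name for the checkpoint are assumptions.

from nemo.export.tensorrt_llm import TensorRTLLM

exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine")  # hypothetical engine directory

exporter.export(
    nemo_checkpoint_path="/path/to/model.nemo",  # hypothetical checkpoint path
    model_type="llama",
    tensor_parallel_size=2,
    pipeline_parallel_size=1,
    gpus_per_node=2,             # new in this commit; defaults to tensor_parallel_size
    use_embedding_sharing=True,  # new in this commit; forced on when the NeMo config shares embeddings
    max_input_len=512,
    max_output_len=256,
    max_batch_size=8,
)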
36 changes: 28 additions & 8 deletions nemo/export/trt_llm/converter/model_converter.py
@@ -72,9 +72,17 @@ def model_to_trtllm_ckpt(
dtype: str = "bfloat16",
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
gpus_per_node: int = None,
use_parallel_embedding: bool = False,
use_embedding_sharing: bool = False,
) -> Tuple[List[Dict], List[PretrainedConfig]]:

if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing:
LOGGER.info(
"Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True"
)
use_embedding_sharing = True

weights_dict = convert_model_to_trt_llm_ckpt(
model=model,
nemo_model_config=nemo_model_config,
@@ -88,12 +96,14 @@ def model_to_trtllm_ckpt(

world_size = tensor_parallel_size * pipeline_parallel_size

lm_head_weight = weights_dict["lm_head.weight"]
has_lm_head = "lm_head.weight" in weights_dict
if has_lm_head:
lm_head_weight = weights_dict["lm_head.weight"]

vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0]
vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size)
vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size

if vocab_size_padded != vocab_size:
if has_lm_head and vocab_size_padded != vocab_size:
pad_width = vocab_size_padded - vocab_size
lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0)

@@ -120,7 +130,7 @@ def model_to_trtllm_ckpt(
'hidden_act': hidden_act,
'use_parallel_embedding': use_parallel_embedding,
'embedding_sharding_dim': 0,
'share_embedding_table': False,
'share_embedding_table': use_embedding_sharing,
'quantization': {
'quant_algo': None,
'kv_cache_quant_algo': None,
@@ -160,9 +170,15 @@ def model_to_trtllm_ckpt(
"transformer.ln_f.bias",
}

gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node

for i in range(world_size):
mapping = tensorrt_llm.Mapping(
world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size
world_size=world_size,
rank=i,
tp_size=tensor_parallel_size,
pp_size=pipeline_parallel_size,
gpus_per_node=gpus_per_node,
)
layers_range = mapping.pp_layers(num_layers)

@@ -174,6 +190,8 @@ def model_to_trtllm_ckpt(
if new_key.endswith(".bin"): # TP split
if new_key.endswith(f"{mapping.tp_rank}.bin"):
new_key = new_key.replace(f".{mapping.tp_rank}.bin", "")
else:
continue
if "layers" in new_key: # PP
layer_num = int(new_key.split(".")[2])
if layer_num in layers_range:
@@ -202,15 +220,17 @@ def model_to_trtllm_ckpt(
weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight

if mapping.is_last_pp_rank():
weights_dict_local["lm_head.weight"] = np.ascontiguousarray(
split(lm_head_weight, mapping.tp_size, mapping.tp_rank)
)
if has_lm_head:
weights_dict_local["lm_head.weight"] = np.ascontiguousarray(
split(lm_head_weight, mapping.tp_size, mapping.tp_rank)
)
weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"]

ln_f_bias = weights_dict.get("transformer.ln_f.bias")
if ln_f_bias is not None:
weights_dict_local["transformer.ln_f.bias"] = ln_f_bias

config["gpus_per_node"] = gpus_per_node
model_config = PretrainedConfig(**config)
model_config.mapping = mapping
model_configs.append(model_config)
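The ".bin" handling above deserves a closer look: TP-split tensors are stored with a ".<tp_rank>.bin" suffix, and the added else/continue makes each rank skip the other ranks' shards instead of letting them fall through to the pipeline-parallel filtering below. A minimal sketch of that filter, with made-up key names and string stand-ins for tensors:

# Sketch of the per-rank TP-split filter; keys and values are hypothetical.
weights_dict = {
    "transformer.layers.0.attention.qkv.weight.0.bin": "shard for tp rank 0",
    "transformer.layers.0.attention.qkv.weight.1.bin": "shard for tp rank 1",
    "transformer.ln_f.weight": "replicated tensor",
}

def select_for_rank(weights_dict, tp_rank):
    selected = {}
    for key, value in weights_dict.items():
        if key.endswith(".bin"):  # TP-split tensor
            if key.endswith(f".{tp_rank}.bin"):
                selected[key.replace(f".{tp_rank}.bin", "")] = value
            else:
                continue  # another rank's shard; skipping it is the fix above
        else:
            selected[key] = value  # replicated tensors go to every rank
    return selected

rank0 = select_for_rank(weights_dict, 0)
assert rank0["transformer.layers.0.attention.qkv.weight"] == "shard for tp rank 0"
assert "transformer.ln_f.weight" in rank0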
6 changes: 0 additions & 6 deletions nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
@@ -158,8 +158,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int):
model_level_weights["transformer.position_embedding.weight"].append(val)
if pp_idx == 0:
val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)]
if embedding_scaling:
val = val * float(math.sqrt(hidden_size))

vocab_size = val.shape[0]
if use_parallel_embedding:
@@ -171,10 +169,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int):

val = torch_to_numpy(val.to(storage_type).cpu())
model_level_weights["transformer.vocab_embedding.weight"].append(val)
if share_embeddings_and_output:
val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)]
val = torch_to_numpy(val.to(storage_type).cpu())
model_level_weights["lm_head.weight"].append(val)
if has_lm_head and pp_idx == training_pp_size - 1:
val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)]
val = torch_to_numpy(val.to(storage_type).cpu())
4 changes: 2 additions & 2 deletions nemo/export/trt_llm/tensorrt_llm_build.py
@@ -19,7 +19,7 @@
from tensorrt_llm.builder import BuildConfig, Builder
from tensorrt_llm.commands.build import build as build_trtllm
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_manager import LoraBuildConfig
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights
from tensorrt_llm.plugin import PluginConfig

@@ -94,7 +94,7 @@ def build_and_save_engine(

if use_lora_plugin is not None:
build_config.plugin_config.set_lora_plugin(use_lora_plugin)
lora_config = LoraBuildConfig(
lora_config = LoraConfig(
lora_dir=lora_ckpt_list,
lora_ckpt_source='nemo',
max_lora_rank=max_lora_rank,
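The LoraBuildConfig-to-LoraConfig change tracks the class rename in TensorRT-LLM 0.10. If this module ever had to run against both old and new releases, a guarded import along these lines would cover it (a sketch, not part of this commit):

try:
    from tensorrt_llm.lora_manager import LoraConfig  # TensorRT-LLM >= 0.10
except ImportError:
    # Older TensorRT-LLM releases used the previous class name.
    from tensorrt_llm.lora_manager import LoraBuildConfig as LoraConfig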
38 changes: 38 additions & 0 deletions tests/export/test_nemo_export.py → tests/export/nemo_export.py
@@ -128,6 +128,7 @@ def run_trt_llm_inference(
trt_llm_model_dir,
n_gpu=1,
max_batch_size=8,
use_embedding_sharing=False,
max_input_len=128,
max_output_len=128,
ptuning=False,
@@ -216,6 +217,7 @@ def run_trt_llm_inference(
lora_target_modules=lora_target_modules,
max_num_tokens=int(max_input_len * max_batch_size * 0.2),
opt_num_tokens=60,
use_embedding_sharing=use_embedding_sharing,
save_nemo_model_config=True,
)

@@ -237,6 +239,14 @@ def run_trt_llm_inference(
stop_words_list=stop_words_list,
)

if not use_lora_plugin and not ptuning:
test_cpp_runtime(
engine_path=trt_llm_model_dir,
prompt=prompt,
max_output_len=max_output_len,
debug=True,
)

nq = None
nm = None
output_deployed = ""
@@ -290,6 +300,27 @@ def run_trt_llm_inference(
raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))


def test_cpp_runtime(
engine_path,
prompt,
max_output_len,
debug,
):
trt_llm_exporter = TensorRTLLM(engine_path, load_model=True)
output = trt_llm_exporter.forward(
input_texts=prompt,
max_output_len=max_output_len,
top_k=1,
top_p=0.0,
temperature=1.0,
)

if debug:
print("")
print("--- Output deployed with cpp runtime: ", output)
print("")


def run_existing_checkpoints(
model_name,
n_gpus,
@@ -332,6 +363,12 @@ def run_existing_checkpoints(
else:
raise Exception("There is not lora checkpoint path defined.")

if model_info["model_type"] == "gemma":
print("*********************")
use_embedding_sharing = True
else:
use_embedding_sharing = False

return run_trt_llm_inference(
model_name=model_name,
model_type=model_info["model_type"],
@@ -340,6 +377,7 @@ def run_existing_checkpoints(
trt_llm_model_dir=model_info["trt_llm_model_dir"],
n_gpu=n_gpus,
max_batch_size=model_info["max_batch_size"],
use_embedding_sharing=use_embedding_sharing,
max_input_len=512,
max_output_len=model_info["max_output_len"],
ptuning=ptuning,
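The renamed test module can also be driven directly for a quick smoke test of the new C++ runtime path. A minimal sketch, assuming the repository root is on PYTHONPATH and an engine has already been exported to the (hypothetical) directory below:

from tests.export.nemo_export import test_cpp_runtime

test_cpp_runtime(
    engine_path="/tmp/trt_llm_model_dir",  # hypothetical; must contain a built engine
    prompt=["What is the color of a banana?"],  # illustrative prompt
    max_output_len=32,
    debug=True,  # prints the C++ runtime output
)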
58 changes: 29 additions & 29 deletions tests/export/run.sh
@@ -20,32 +20,32 @@ for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done
set +x


python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2
python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8
python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8
python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8
python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/test_nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/test_nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8
python tests/export/test_nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/test_nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8
python tests/export/test_nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8
python tests/export/test_nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2
python tests/export/nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8
python tests/export/nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8
python tests/export/nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8
python tests/export/nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8
python tests/export/nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8
python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2
python tests/export/nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8
python tests/export/nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8
python tests/export/nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1
python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1
