Zero-shot evaluation pipeline for mcore RETRO #8941

Merged: 146 commits, Apr 23, 2024
Commits
e91a66d
update branch
ericharper Jan 29, 2024
305ad9c
Add dist ckpt support for regular optimizers (#7749)
mikolajblaz Jan 31, 2024
40da002
Pin lhotse=1.19.2 in r1.23.0 (#8303)
pzelasko Feb 1, 2024
d3bad4b
Cache Aware Streaming tutorial notebook (#8296)
erastorgueva-nv Feb 1, 2024
17f09e4
fix path location and branch (#8304)
nithinraok Feb 2, 2024
991dad9
add deallocate pipeline output optimization (#8279)
JimmyZhang12 Feb 2, 2024
e9320ed
Fix memory leak caused by context parallelism hanging references by o…
JimmyZhang12 Feb 2, 2024
8b18cfc
remove assertion (#8302)
dimapihtar Feb 2, 2024
d9f1409
Update PEFT Doc (#8262)
cuichenx Feb 3, 2024
a592517
Attention encoder-decoder models for multiple speech-to-text tasks …
titu1994 Feb 3, 2024
3ef5513
add code for calling mcore_retro in NeMo
huvunvidia Nov 27, 2023
6218b8c
add code for calling mcore_retro in NeMo
huvunvidia Nov 27, 2023
c5907ac
runnable, training curve match retro mcore and nemo
huvunvidia Dec 15, 2023
5f10619
working on retro inference
huvunvidia Jan 10, 2024
ecc061e
working on megatron_retro_eval.py and megatron_retro_inference.yaml
huvunvidia Jan 10, 2024
c1e99d3
refactoring text_generation_utils code and retro inference relevant f…
Jan 20, 2024
2bf5d2c
clean PR
Jan 26, 2024
db6ffe3
resolving quick hacks (reading number of train/valid samples from wor…
Jan 30, 2024
1d1021c
clean repository
Jan 31, 2024
9b3cd36
revert changes to inference/eval code to original in main
Jan 31, 2024
31834d5
clean code
Jan 31, 2024
43c4af9
runable training code, with already implemented eval code
Jan 31, 2024
45ea217
[tutorial] fixed missing RIR scripts file. (#8257)
XuesongYang Jan 29, 2024
186a369
add values to en tts dict (#7879)
mgrafu Jan 30, 2024
7aea8c9
Add Bert HF checkpoint converter (#8088)
yaoyu-33 Jan 31, 2024
11787e7
revert to original eval code files
Jan 31, 2024
a1faebf
revert to original eval code files 2
Jan 31, 2024
78070f5
revert to original eval code files 3
Jan 31, 2024
7f2a889
revert to original eval code files 4
Jan 31, 2024
c0e4ea2
clean code
Feb 1, 2024
5dbee42
clean code
Feb 1, 2024
a9fb106
update my code to support changes from lastest main
Feb 2, 2024
769605c
commit before rebase r1.23.0
Feb 6, 2024
c3c766e
Multimodal r1.23.0 bug fix (#8315)
yaoyu-33 Feb 6, 2024
53bac6e
copy paste files from r1.23.0
Feb 6, 2024
9830475
clean PR
Feb 6, 2024
1434979
Fixes for MoE parameter passing & use of AutoTokenizer/Model for mist…
akoumpa Feb 6, 2024
ec8f413
Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP…
erhoo82 Feb 6, 2024
50864db
Remove asr webapp (#8347)
titu1994 Feb 6, 2024
498e9e4
remove _target_ at model level in aed config (#8351)
krishnacpuvvada Feb 6, 2024
c4f38cd
revert changes for tts and asr
Feb 7, 2024
2f72846
Add change_vocabulary and save_tokenizers() support to Multitask ASR …
titu1994 Feb 7, 2024
931c53c
Change default (#8371)
titu1994 Feb 8, 2024
7c75022
implement retro's own fwd_bwd_step() and validation_step() to not hav…
Feb 8, 2024
40bb4a2
adding megatron compile_helpers(), in future can be fixed with correc…
Feb 9, 2024
0e13348
bug fix in fast-conformer-aed.yaml and adding jenkins test for speech…
krishnacpuvvada Feb 9, 2024
138a7ab
Enable megatron core loggers for GPT pretraining (#8354)
ashbhandare Feb 9, 2024
4ee9c58
mcore ds fix (#8283)
dimapihtar Feb 9, 2024
de96b6e
addressing Eric's reviews
Feb 9, 2024
0e806b9
adding existing implementation RETRO files
Feb 9, 2024
09d9ce2
adding existing implementation RETRO files
Feb 9, 2024
02ec761
Add Finetuning tutorial with HF Datasets (#8356)
nithinraok Feb 9, 2024
88d7b21
release updates (#8378)
dimapihtar Feb 9, 2024
400c4a1
MCore dataset compatibility for tokenizers (#8390)
vysarge Feb 11, 2024
3112091
Mcore customization doc (#8298)
HuiyingLi Feb 12, 2024
68eba36
wer fix (#8404)
tbartley94 Feb 12, 2024
5b8f18c
updated link to pubmed (#8402)
nithinraok Feb 13, 2024
0f7b49b
Update NFA video download link (#8406)
erastorgueva-nv Feb 13, 2024
f897a77
revert changes (#8410)
cuichenx Feb 13, 2024
371de5b
Fix dreambooth data sampler issue (#8400)
yaoyu-33 Feb 13, 2024
98186c2
Fixed errors in the CTM gen functions (#8416)
tango4j Feb 14, 2024
8689bc0
add ensemble decoding fix (#8427)
nithinraok Feb 15, 2024
770f73b
SDE bugfix log (#8430)
Jorjeous Feb 15, 2024
05122bd
mcore customization doc minor fix (#8421)
HuiyingLi Feb 16, 2024
2e77f20
NeMo-Mistral to HF converter bugfix. (#8353)
akoumpa Feb 16, 2024
9588494
Fixing mcore bert for TP, PP and SP (#8336)
shanmugamr1992 Feb 16, 2024
71ce00c
Add settings to suppress bf16 compile errors in CI on V100 (#8481)
athitten Feb 22, 2024
c98b9c1
MoE parameter passing (#8255)
akoumpa Feb 23, 2024
a836fce
Update k2 version (#8478) (#8492)
artbataev Feb 23, 2024
0dc8a19
Add fp8 support for SD/Update notebook paths (#8489)
Victor49152 Feb 25, 2024
1d80d00
pin to 0.5.0 (#8465)
ericharper Feb 26, 2024
fcf1044
Update NeMo Multimodal Requirements (#8515)
yaoyu-33 Feb 26, 2024
d2283e3
update github raw content link (#8517)
cuichenx Feb 26, 2024
e6b7354
Add dep notice for notebooks (#8522)
ericharper Feb 27, 2024
ae9a2aa
Revert FP8 integration (#8520)
Victor49152 Feb 27, 2024
e772dbf
Update data prep notebook (#8532)
Victor49152 Feb 27, 2024
21984a1
before update branch with latest r1.23.0
Mar 4, 2024
52ee601
Merge remote-tracking branch 'origin/r1.23.0' into huvu/mcore_retro
Mar 4, 2024
b5d8aec
update to run with MLM ae2817b3dde4efb1515061a5311d01d8f85bd99c (runn…
Mar 5, 2024
4199bc7
remove compile_helpers
Mar 5, 2024
0ff5673
reverse changes from main branch to r1.23.0
Mar 5, 2024
74994c2
adding *_legacy files
Mar 5, 2024
061b632
update MLM commit in Jenkinsfile to latest
Mar 6, 2024
41b0178
debugging Jenkinstest: test different mcore import in retro_dataset
Mar 6, 2024
f9c3293
update Jenkinsfile edit megatron_retro_mutransfer_pretrain_legacy.py
Mar 7, 2024
88ef4d4
removing all mcore RETRO to pass the Jenkinstest
Mar 7, 2024
251762b
fixing import legacy problem for tests/collections/nlp/test_indexed_r…
Mar 7, 2024
8a6452d
update Jenkinsfile file to use TE v0.7
Mar 7, 2024
12263c0
update NeMo to work with latest mcore RETRO (solving TE problems)
Mar 20, 2024
188dd43
update TE commit Jenkinsfile to be the same with r1.23.0's Jenkinsfile
Mar 20, 2024
26c44a2
update commit for MLM
Mar 20, 2024
1c4c9a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2024
9068890
jenkinstest debugging
Mar 20, 2024
adff1f8
temporary fix RETRO's __init__ for jenkinstest
Mar 21, 2024
fec1852
edit splits_string in jenkinsfile to correct format; put RETRO test i…
Mar 21, 2024
ab4c6c0
edit splits_string in jenkinsfile to correct format; put RETRO test i…
Mar 21, 2024
5cedf92
edit splits_string in jenkinsfile to correct format; put RETRO test i…
Mar 21, 2024
fadd10b
edit splits_string in jenkinsfile to correct format; put RETRO test i…
Mar 21, 2024
08d2d73
add model.data.dataloader_type=cyclic to jenkinsfile
Mar 22, 2024
ff2c904
runnable for inference
Mar 31, 2024
92ade73
update code to work with latest megatron-lm main 81dab6067
Apr 4, 2024
4f547fb
update M-LM commit in Jenkinsfile to latest main M-LM 81dab6067
Apr 4, 2024
96ed408
cleaning inference code
Apr 4, 2024
e8a83ed
fix to by pass CI test bf16 problem (following this PR https://github…
Apr 8, 2024
b33d8a0
isort and black
Apr 8, 2024
2402e66
adjusting model.micro_batch_size to 1
Apr 8, 2024
d050eb9
fix conflicts
Apr 9, 2024
7b13e95
fix BRANCH = 'r1.23.0'
Apr 9, 2024
35dc730
replace tutorials dir from main branch to huvu/mcore_retro
Apr 9, 2024
003b4b3
fix minor merges conflict
Apr 9, 2024
964f366
update Jenkinsfile
Apr 9, 2024
0787a20
runnable with a temporary fix from Jacek (unfound -unfinished problem)
Apr 10, 2024
39b2d76
runnable with a temporary fix from Jacek (unfound -unfinished problem)
Apr 10, 2024
17beaf8
merged from main on 10apr
Apr 10, 2024
19bfae0
modified nlp_overrides.py back to original
Apr 10, 2024
1ea089d
fix checkpoint from Jacek Bieniusiewicz
Apr 10, 2024
ac3e2b9
config Jenkinsfile test
Apr 10, 2024
a522627
set RETRO Jenkins MBS to 1
Apr 11, 2024
72ff280
black fix
Apr 11, 2024
4fbb7b8
isort fix
Apr 11, 2024
62f6d7e
update TE commit
Apr 11, 2024
41de539
update to latest Jenkinsfile with latest container and commits
Apr 11, 2024
32ab269
remove new RETRO jenkinstest
Apr 11, 2024
04844cd
Merge remote-tracking branch 'origin/main' into huvu/mcore_retro
Apr 11, 2024
637448c
merge latest main
Apr 11, 2024
6022d66
put RETRO Jenkinstest to the right place
Apr 11, 2024
01ead1e
merge from latest origin/huvu/nemo_retro
Apr 11, 2024
ce7e9e0
update code for megatron_retro_pretraining_legacy.py
Apr 11, 2024
685bb1e
update Jenkins and _legacy.py
Apr 12, 2024
e5c935d
update new RETRO jenkinstest to run faster
Apr 12, 2024
2849a07
fixing errors from GitHub Advanced Security / CodeQL
Apr 12, 2024
87ecf60
fixing errors from GitHub Advanced Security / CodeQL
Apr 12, 2024
0e5bfce
update manually branch to huvu/mcore_retro
Apr 12, 2024
493d66e
remove DEBUGGING markers
Apr 12, 2024
296dc95
merging and solve conflicts
Apr 16, 2024
9455d6d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2024
002c49c
copy paste scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt
Apr 16, 2024
554a06d
update codes to fix Github warnings; adding cicd-main.yml action tests
Apr 16, 2024
a767066
cleaning code, addressing Shanmugam's comments
Apr 19, 2024
b69b1d6
saving before pulling from main
Apr 19, 2024
cdf3dc2
pulled from main
Apr 19, 2024
16a2cf3
cleaning code
Apr 19, 2024
896e591
adding deprecations note
Apr 19, 2024
6262e2a
Merge remote-tracking branch 'origin/main' into huvu/mcore_retro_eval
Apr 22, 2024
119c6b9
Merge remote-tracking branch 'origin/main' into huvu/mcore_retro_eval
Apr 23, 2024
07a7a73
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
44 changes: 21 additions & 23 deletions examples/nlp/language_modeling/conf/megatron_retro_inference.yaml
@@ -3,42 +3,40 @@ inference:
top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature: 1.0 # sampling temperature
add_BOS: True # add the bos token at the beginning of the prompt
add_BOS: False # add the bos token at the beginning of the prompt
tokens_to_generate: 30 # The maximum number of tokens to generate.
all_probs: False # whether to return the log probs for all tokens in the vocab
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # whether to compute the log prob of all the input text; a special case of running inference, default False

end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated
# RETRO-specific arguments
retro_inference:
retro_gpt_retrieved_length: 128
retro_num_neighbors: 2
ft_neighbours: 0
reuse_top: False

trainer:
devices: 1
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 16 # 16, 32, or bf16

inference_batch_size: 2
precision: 32 # 16, 32, or bf16
use_distributed_sampler: False
tensor_model_parallel_size: -1
pipeline_model_parallel_size: -1
pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
retro_model_file: null # RETRO nemo file path
megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory

use_predict_method: False # whether to use the predict method
retro_model_file: null # Retro nemo file path
checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the Retro training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading

prompts: # prompts for RETRO model inference
- "hello,"
- "good morning,"
- "good afternoon,"
- "good evening,"

########### Faiss service parameters ########
retrieval_service:
strategy: RetroModelTextGenerationStrategy # choose customized inference strategy
neighbors: 4
frequent_query: False # for the current token generation, frequently update the retrieval context. If False, update it every 64 tokens
pad_tokens: True # pad the tokens at the beginning so the prompt has at least 64 tokens, ensuring retrieval happens at least once
store_retrieved: False # whether to store the retrieved documents so they can be inspected
combo_service:
service_ip: '0.0.0.0'
service_port: 17181
# RETRO inference
prompt: "sample prompt"
neighbors:
- "neighbor text 1"
- "neighbor text 2"
@@ -0,0 +1,46 @@
# (This inference config for native NeMo RETRO will soon be deprecated. For the new mcore RETRO inference config, see ./megatron_retro_inference.yaml)

inference:
greedy: False # Whether or not to use sampling ; use greedy decoding otherwise
top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature: 1.0 # sampling temperature
add_BOS: True # add the bos token at the beginning of the prompt
tokens_to_generate: 30 # The maximum number of tokens to generate.
all_probs: False # whether to return the log probs for all tokens in the vocab
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # whether to compute the log prob of all the input text; a special case of running inference, default False


trainer:
devices: 1
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 16 # 16, 32, or bf16

inference_batch_size: 2
tensor_model_parallel_size: -1
pipeline_model_parallel_size: -1
pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
retro_model_file: null # RETRO nemo file path

use_predict_method: False # whether to use the predict method

prompts: # prompts for RETRO model inference
- "hello,"
- "good morning,"
- "good afternoon,"
- "good evening,"

########### Faiss service parameters ########
retrieval_service:
strategy: RetroModelTextGenerationStrategy # choose customized inference strategy
neighbors: 4
frequent_query: False # for the current token generation, frequently update the retrieval context. If False, update it every 64 tokens
pad_tokens: True # pad the tokens at the beginning so the prompt has at least 64 tokens, ensuring retrieval happens at least once
store_retrieved: False # whether to store the retrieved documents so they can be inspected
combo_service:
service_ip: '0.0.0.0'
service_port: 17181
40 changes: 40 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_retro_qatask.yaml
@@ -0,0 +1,40 @@
inference:
greedy: False # Whether or not to use sampling ; use greedy decoding otherwise
top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature: 1.0 # sampling temperature
add_BOS: False # add the bos token at the beginning of the prompt
tokens_to_generate: 30 # The maximum number of tokens to generate.
all_probs: False # whether to return the log probs for all tokens in the vocab
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # whether to compute the log prob of all the input text; a special case of running inference, default False
end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated
# RETRO-specific arguments
retro_inference:
retro_gpt_retrieved_length: 128
retro_num_neighbors: 2
ft_neighbours: 0
reuse_top: False

trainer:
devices: 1
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 32 # 16, 32, or bf16
use_distributed_sampler: False

tensor_model_parallel_size: -1
pipeline_model_parallel_size: -1
pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory

retro_model_file: null # Retro nemo file path
checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the Retro training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading

# qa tasks
qa_file_path: null
pred_file_path: null
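
The QA-task config reuses the same inference and trainer blocks and adds qa_file_path / pred_file_path; the exact file format is defined by the companion QA evaluation script in this PR, which is not shown in this excerpt. Purely as an illustration, assuming a simple JSONL layout with one question per line, a harness around such paths might look like the sketch below; the field names and format are assumptions, not the PR's actual contract.

```python
# Illustrative sketch only: the real QA file format is defined elsewhere in the PR.
# Assumes a JSONL input with a "question" field; predictions are written one per line.
import json


def run_qa_eval(qa_file_path: str, pred_file_path: str, generate_fn) -> None:
    """Read questions, generate an answer for each, and write predictions."""
    with open(qa_file_path, "r", encoding="utf-8") as fin, \
         open(pred_file_path, "w", encoding="utf-8") as fout:
        for line in fin:
            sample = json.loads(line)  # assumed shape: {"question": "..."}
            answer = generate_fn(sample["question"])
            fout.write(json.dumps({"question": sample["question"], "answer": answer}) + "\n")


# Example usage with a stand-in generator:
# run_qa_eval("qa_test.jsonl", "preds.jsonl", lambda q: "placeholder answer")
```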
183 changes: 87 additions & 96 deletions examples/nlp/language_modeling/megatron_retro_eval.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,128 +12,119 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import os

from examples.nlp.language_modeling.megatron_gpt_eval import RequestDataSet
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader
import torch
from omegaconf import OmegaConf
from pytorch_lightning.trainer.trainer import Trainer
from torch.utils.data import DataLoader, Dataset

from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy
from nemo.core.config import hydra_runner

try:
from megatron.core import parallel_state

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

HAVE_MEGATRON_CORE = False
from nemo.utils.app_state import AppState
from nemo.utils.model_utils import inject_model_parallel_rank

"""
This is the script to run RETRO Model text generation.
This is the script to run Retro text generation.

Usage:
Assume the model has TP=1, PP=1
run greedy inference from a nemo file:
Currently, Mcore-based RETRO only supports a batch size of 1.
Example running greedy inference from a distributed checkpoint dir:
python megatron_retro_eval.py \
checkpoint_dir=PATH_TO_CHECKPOINT \
checkpoint_name=CHECKPOINT_NAME \
inference.greedy=True \
inference.add_BOS=False \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.accelerator=gpu \
trainer.precision=16 \
inference.tokens_to_generate=128 \
inference.greedy=True \
retro_model_file=path_to_retro_nemo_file \
tensor_model_parallel_size=-1 \
pipeline_model_parallel_size=-1 \
retrieval_service.faiss_devices='0' \
retrieval_service.faiss_index=path_to_faiss_index \
retrieval_service.retrieval_index=path_to_retrieval_dataset \
retrieval_service.neighbors=20
"""
prompt="sample prompt" \
inference.retro_inference.retro_num_neighbors=2 \
neighbors=["neighbor text 1", "neighbor text 2"]


@hydra_runner(config_path="conf", config_name="megatron_retro_inference")
def main(cfg) -> None:
trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
```
"""

model_path = cfg.retro_model_file
if not torch.cuda.is_available():
raise EnvironmentError("GPU is needed for the inference")

save_restore_connector = NLPSaveRestoreConnector()

if os.path.isdir(model_path):
save_restore_connector.model_extracted_dir = model_path
class RequestDataSet(Dataset):
def __init__(self, sentences, neighbors):
super().__init__()
self.sentences = sentences
self.neighbors = neighbors

model_cfg = MegatronRetrievalModel.restore_from(
model_path, trainer=trainer, return_config=True, save_restore_connector=save_restore_connector,
)
def __len__(self,):
return len(self.sentences)

with open_dict(model_cfg):
model_cfg.precision = trainer.precision
model_cfg.sequence_parallel = False
model_cfg.activations_checkpoint_granularity = None
model_cfg.activations_checkpoint_method = None

if (
cfg.tensor_model_parallel_size < 0
or cfg.pipeline_model_parallel_size < 0
or cfg.get('pipeline_model_parallel_split_rank', -1) < 0
):
with open_dict(cfg):
cfg.tensor_model_parallel_size = model_cfg.get('tensor_model_parallel_size', 1)
cfg.pipeline_model_parallel_size = model_cfg.get('pipeline_model_parallel_size', 1)
cfg.pipeline_model_parallel_split_rank = model_cfg.get('pipeline_model_parallel_split_rank', 0)

model = MegatronRetrievalModel.restore_from(
model_path, trainer=trainer, save_restore_connector=save_restore_connector, override_config_path=model_cfg,
)
def __getitem__(self, idx):
return {'prompts': self.sentences[idx], 'neighbors': self.neighbors[idx]}

length_params: LengthParam = {
"max_length": cfg.inference.tokens_to_generate,
"min_length": cfg.inference.min_tokens_to_generate,
}

sampling_params: SamplingParam = {
"use_greedy": cfg.inference.greedy,
"temperature": cfg.inference.temperature,
"top_k": cfg.inference.top_k,
"top_p": cfg.inference.top_p,
"repetition_penalty": cfg.inference.repetition_penalty,
"add_BOS": cfg.inference.add_BOS,
"all_probs": cfg.inference.all_probs,
"compute_logprob": cfg.inference.compute_logprob,
}
@hydra_runner(config_path="conf", config_name="megatron_retro_inference")
def main(cfg) -> None:

# trainer required for restoring model parallel models
trainer = Trainer(
strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)),
**cfg.trainer,
callbacks=[CustomProgressBar()],
)

# check whether the DDP is initialized
if not parallel_state.is_initialized():
if cfg.checkpoint_dir:
app_state = AppState()
if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size
app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
(
app_state.tensor_model_parallel_rank,
app_state.pipeline_model_parallel_rank,
app_state.model_parallel_size,
app_state.data_parallel_size,
app_state.pipeline_model_parallel_split_rank,
app_state.virtual_pipeline_model_parallel_rank,
) = fake_initialize_model_parallel(
world_size=app_state.model_parallel_size,
rank=trainer.global_rank,
tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
)
checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)
# checkpoint_path is a dir in case of distributed checkpointing
if not os.path.isdir(checkpoint_path):
# legacy checkpoint needs model parallel rank injection
checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
model = MegatronRetroModel.load_from_checkpoint(
checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer
)
else:
raise ValueError("Requiring distributed checkpoint dir for loading Mcore RETRO.")

def dummy():
return
model.freeze()

if model.trainer.strategy.launcher is not None:
model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
model.trainer.strategy.setup_environment()
# Have to turn off activations_checkpoint_method for inference
try:
model.model.language_model.encoder.activations_checkpoint_method = None
except AttributeError:
[Code scanning / CodeQL notice, "Empty except": 'except' clause does nothing but pass and there is no explanatory comment.]
pass

prompt = [cfg.prompt]
neighbors = [cfg.neighbors]
ds = RequestDataSet(prompt, neighbors)
bs = 1
request_dl = DataLoader(dataset=ds, batch_size=bs)
config = OmegaConf.to_container(cfg.inference)
retrieval_service = OmegaConf.to_container(cfg.retrieval_service)
model.set_inference_config(config, retrieval_service)

if not cfg.use_predict_method:
# First method of running text generation, call model.generate method
response = model.generate(
inputs=OmegaConf.to_container(cfg.prompts),
length_params=length_params,
sampling_params=sampling_params,
strategy=model.inference_strategy,
)
else:
# Second method of running text generation, call trainer.predict
ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
request_dl = DataLoader(dataset=ds, batch_size=cfg.inference_batch_size)
response = trainer.predict(model, request_dl)
model.set_inference_config(config)

response = trainer.predict(model, request_dl)

print("***************************")
print(response)
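
For context, here is a small sketch of how the request pipeline in this script could be driven with several prompts, each paired with its own neighbor list. It assumes `model`, `trainer`, and `RequestDataSet` were already built exactly as above, keeps the batch size of 1 that mcore RETRO currently supports, and uses placeholder prompts and neighbors.

```python
# Sketch only: assumes `model`, `trainer`, and RequestDataSet from the script above.
from torch.utils.data import DataLoader

prompts = ["sample prompt 1", "sample prompt 2"]        # placeholder prompts
neighbors = [
    ["neighbor text 1a", "neighbor text 1b"],           # neighbors for prompt 1
    ["neighbor text 2a", "neighbor text 2b"],           # neighbors for prompt 2
]

ds = RequestDataSet(prompts, neighbors)
request_dl = DataLoader(dataset=ds, batch_size=1)       # mcore RETRO: batch size 1
responses = trainer.predict(model, request_dl)
print(responses)
```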