
Support flash decoding #7744

Merged: 11 commits, Oct 24, 2023
1 change: 1 addition & 0 deletions examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -219,6 +219,7 @@ def main(cfg) -> None:
pretrained_cfg.activations_checkpoint_granularity = None
pretrained_cfg.activations_checkpoint_method = None
pretrained_cfg.precision = trainer.precision
pretrained_cfg["use_flash_attention"] = cfg.inference.get("use_flash_attention", False)
if pretrained_cfg.get('mcore_gpt', False):
# with dist checkpointing we can use the model parallel config specified by the user
pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size
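The eval-script change above copies the `use_flash_attention` flag from the inference config into the pretrained model config, defaulting to `False` when the key is absent. A minimal sketch of that pattern with OmegaConf (the config values here are made up for illustration, not taken from the PR):

```python
# Hedged sketch: how the inference flag reaches the pretrained config.
from omegaconf import OmegaConf

cfg = OmegaConf.create({"inference": {"use_flash_attention": True}})
pretrained_cfg = OmegaConf.create({})

# Mirrors the diff above: default to False when the key is absent.
pretrained_cfg["use_flash_attention"] = cfg.inference.get("use_flash_attention", False)
print(pretrained_cfg.use_flash_attention)  # True
```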
27 changes: 26 additions & 1 deletion nemo/collections/nlp/modules/common/megatron/attention.py
@@ -89,6 +89,14 @@
flash_attn_unpadded_func, flash_attn_func_triton, flash_attn_func = None, None, None
unpad_input, pad_input = None, None

try:
# Flash Attention 2.2
from flash_attn import flash_attn_with_kvcache

except (ImportError, ModuleNotFoundError):

flash_attn_with_kvcache = None

""" We use the following notation throughout this file:
h: hidden size
n: number of attention heads
@@ -144,6 +152,7 @@ def __init__(
self.normalize_attention_scores = normalize_attention_scores
self.position_embedding_type = position_embedding_type
self.multi_query_attention = multi_query_attention
self.use_flash_attention = use_flash_attention

self.megatron_legacy = megatron_legacy
self.dtype = utils_funcs.torch_dtype_from_precision(precision, megatron_amp_O2)
@@ -503,7 +512,23 @@ def forward(
if get_key_value:
present = (key_layer, value_layer)

if checkpoint_core_attention:
if (
flash_attn_with_kvcache is not None
and self.use_flash_attention
and rotary_pos_emb is not None
and inference_max_sequence_len
and not set_inference_key_value_memory
):
# Mainly used for decoding with sq=1
q = rearrange(apply_rotary_pos_emb(query_layer, rotary_pos_emb[0]), 'sq b np hn -> b sq np hn')
k = rearrange(apply_rotary_pos_emb(key_layer, rotary_pos_emb[1]), 'sk b np hn -> b sk np hn')
v = rearrange(value_layer, 'sk b np hn -> b sk np hn')
context_layer = flash_attn_with_kvcache(
q=q, k_cache=k, v_cache=v, causal=self.attn_mask_type == AttnMaskType.causal,
)
context_layer = rearrange(context_layer, 'b sq np hn -> sq b (np hn)')

elif checkpoint_core_attention:
context_layer = self._checkpointed_attention_forward(
query_layer,
key_layer,
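For reference, a minimal, self-contained sketch (not part of the PR) of the decode-time call added above: single-token queries attending over a key/value cache via `flash_attn_with_kvcache`, with the same layout rearranges as in the diff. It assumes flash-attn >= 2.2, einops, and a CUDA GPU; the shapes are arbitrary examples, and rotary embeddings are omitted (the PR applies them to q and k before the call).

```python
# Hedged sketch, not NeMo code: flash decoding with a KV cache for sq == 1.
# Assumes flash-attn >= 2.2 and a CUDA device; shapes are arbitrary examples.
import torch
from einops import rearrange
from flash_attn import flash_attn_with_kvcache

b, sq, sk, nheads, headdim = 2, 1, 128, 16, 64  # batch, query len, cache len, heads, head dim

# Megatron layout: [seq, batch, heads, head_dim]
q = torch.randn(sq, b, nheads, headdim, dtype=torch.float16, device="cuda")
k = torch.randn(sk, b, nheads, headdim, dtype=torch.float16, device="cuda")
v = torch.randn(sk, b, nheads, headdim, dtype=torch.float16, device="cuda")

# Same rearranges as the diff: flash-attn expects [batch, seq, heads, head_dim].
q_b = rearrange(q, 'sq b np hn -> b sq np hn')
k_b = rearrange(k, 'sk b np hn -> b sk np hn')
v_b = rearrange(v, 'sk b np hn -> b sk np hn')

out = flash_attn_with_kvcache(q=q_b, k_cache=k_b, v_cache=v_b, causal=True)

# Fold heads back into the hidden size, matching 'b sq np hn -> sq b (np hn)'.
out = rearrange(out, 'b sq np hn -> sq b (np hn)')
print(out.shape)  # torch.Size([1, 2, 1024])
```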