Additional changes - now working on char-level TinyShakespeare
* Add missing input LayerNorm to spec (in the default attention
  spec it's fused with the projection Linear layer, so not
  explicitly defined)
* Shape conversion at start and end of Hyena forward
guyjacob committed May 2, 2024
1 parent f91a5ba commit c9ccc1a
Showing 2 changed files with 8 additions and 0 deletions.
@@ -95,6 +95,8 @@ model:
num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.

spec_overrides:
- submodule: 'submodules.input_layernorm'
class: 'megatron.core.transformer.custom_layers.transformer_engine.TENorm'
- submodule: 'submodules.self_attention'
class: 'nemo.collections.nlp.modules.common.hyena.HyenaOperator'
config_key: 'hyena'
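For context, each `spec_overrides` entry above pairs a dotted submodule path with a fully qualified class name. Below is a minimal sketch, in plain Python, of how such an entry could be resolved; the `import_class` and `resolve_override` helpers and the `layer_spec` object are hypothetical and only illustrate the idea, not NeMo's or Megatron-Core's actual spec-override machinery.

```python
import importlib
from functools import reduce


def import_class(path: str):
    """Import a class given its fully qualified dotted path (illustrative helper)."""
    module_name, class_name = path.rsplit('.', 1)
    return getattr(importlib.import_module(module_name), class_name)


def resolve_override(spec, override: dict):
    """Walk the dotted submodule path on `spec` and swap in the requested class.

    Hypothetical sketch -- the real spec-override logic in NeMo may differ.
    """
    *parents, leaf = override['submodule'].split('.')
    target = reduce(getattr, parents, spec)
    setattr(target, leaf, import_class(override['class']))


# Example mirroring the YAML above (assumes `layer_spec` is a ModuleSpec-like object):
# resolve_override(layer_spec, {
#     'submodule': 'submodules.input_layernorm',
#     'class': 'megatron.core.transformer.custom_layers.transformer_engine.TENorm',
# })
```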
6 changes: 6 additions & 0 deletions nemo/collections/nlp/modules/common/hyena/hyena.py
@@ -335,6 +335,9 @@ def recurrence(self, u, state):
raise NotImplementedError("Working on it!")

def forward(self, u, *args, **kwargs):
# In MCore the leading dimension is the sequence dimension
u = rearrange(u, 'l b d -> b l d')

l = u.size(-2)
l_filter = min(l, self.l_max)
u = self.in_proj(u)
@@ -378,6 +381,9 @@ def forward(self, u, *args, **kwargs):
y = self.activation(rearrange(v * x[0], 'b h v z l -> b (z l) (h v)', z=self.num_blocks, h=self.num_heads))
y = self.out_proj(y)

# Convert back to sequence-first for MCore
y = rearrange(y, 'b l h -> l b h')

# MCore TransformerLayer expects tuple where 2nd element represents the bias, it can be None
return y, None

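The two `rearrange` calls added in `forward` only transpose the sequence and batch dimensions: Megatron-Core passes activations sequence-first as `[seq, batch, hidden]`, while the Hyena operator works batch-first. A standalone sketch of the round trip is below; the tensor names and sizes are made up for illustration.

```python
import torch
from einops import rearrange

seq_len, batch, hidden = 128, 4, 256      # arbitrary example sizes
u = torch.randn(seq_len, batch, hidden)   # MCore layout: [seq, batch, hidden]

u_bf = rearrange(u, 'l b d -> b l d')     # batch-first for the Hyena operator
assert u_bf.shape == (batch, seq_len, hidden)

# ... the Hyena forward pass would run here on the batch-first tensor ...

y = rearrange(u_bf, 'b l h -> l b h')     # back to sequence-first for MCore
assert torch.equal(y, u)                  # pure transpose, so the round trip is lossless
```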
