Fix module.training for neva in FusedAttn backward (NVIDIA#8877)
Signed-off-by: yaoyu-33 <yaoyu.094@gmail.com>
Co-authored-by: Pablo Garay <palenq@gmail.com>
Authored by yaoyu-33 and pablo-garay on Apr 15, 2024
1 parent: d8ea1bc · commit: e0bbc54
Showing 2 changed files with 1 addition and 4 deletions.
nemo/collections/multimodal/data/neva/neva_dataset.py (1 addition, 1 deletion)
@@ -782,7 +782,7 @@ class DataCollatorForSupervisedDataset(object):

     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         max_len = max(instance['tokens'].shape[0] for instance in instances)
-        max_len = (max_len - 1) // 4 * 4 + 4
+        max_len = (max_len - 1) // 64 * 64 + 64
         for instance in instances:
             pad_len = max_len - instance['tokens'].shape[0]
             instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0)
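For context, the changed expression rounds the longest sequence length in the batch up to the next multiple of 64 instead of 4, which presumably matches the sequence-length alignment the fused attention kernels expect; the commit itself does not state the rationale. A minimal standalone sketch of the rounding and padding (the helper name and sample lengths below are illustrative, not part of the commit):

import torch
import torch.nn.functional as F

def round_up_to_multiple(n: int, multiple: int = 64) -> int:
    # Hypothetical helper: (n - 1) // multiple * multiple + multiple
    # equals ceil(n / multiple) * multiple for n >= 1.
    return (n - 1) // multiple * multiple + multiple

# Exact multiples stay put, everything else is bumped up to the next multiple of 64.
assert round_up_to_multiple(64) == 64
assert round_up_to_multiple(65) == 128
assert round_up_to_multiple(1) == 64

# Right-padding a 1-D token tensor to the rounded length, as the collator does:
tokens = torch.ones(70, dtype=torch.long)
max_len = round_up_to_multiple(tokens.shape[0])                      # 128
padded = F.pad(tokens, (0, max_len - tokens.shape[0]), 'constant', 0)
assert padded.shape[0] == 128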
Second changed file (3 deletions):
@@ -387,9 +387,6 @@ def __init__(
     def freeze_llm(self, mm_cfg):
         for param in chain(self.embedding.parameters(), self.decoder.parameters(), self.output_layer.parameters(),):
             param.requires_grad = False
-        self.embedding = self.embedding.eval()
-        self.decoder = self.decoder.eval()
-        self.output_layer = self.output_layer.eval()

     def forward(
         self, *args, **kwargs,
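The deleted lines switched the frozen submodules to eval mode as a side effect of freezing them. Calling .eval() flips module.training to False recursively, which is a separate concern from stopping gradient updates and, per the commit title, interfered with the FusedAttn backward path. A minimal sketch of the distinction, using a plain nn.Linear as a stand-in for the NeMo submodules:

import torch
from torch import nn

layer = nn.Linear(8, 8)  # stand-in for self.embedding / self.decoder / self.output_layer

# Freezing: stop gradient updates, but leave the train/eval flag alone.
for param in layer.parameters():
    param.requires_grad = False
assert layer.training is True   # training-dependent ops still see training mode

# .eval() is a different switch: it flips module.training to False recursively.
layer.eval()
assert layer.training is False  # now behaves as inference for training-dependent ops

# The commit keeps only the first behavior, so a frozen LLM inside a model that is
# still training continues to report training=True to code that branches on it.
layer.train()
assert layer.training is True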
