diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index e5e48cdc10da..c2e1f0ed48b7 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -166,7 +166,7 @@ def forward(self, **kwargs):
         the superclass by the square root of the hidden size specified in the configuration.
         """
         embeddings = super().forward(**kwargs)
-        return embeddings * (self.config.hidden_size ** 0.5)
+        return embeddings * torch.tensor(self.config.hidden_size ** 0.5, dtype=embeddings.dtype)
 
 
 class MegatronGPTExportableModel(torch.nn.Module, Exportable):
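
Below is a minimal standalone sketch (not part of the diff) of the dtype-preserving scaling the new line performs; `hidden_size = 768` is an assumed value for illustration. Wrapping the scalar in a 0-dim tensor with an explicit dtype pins the multiplier's type to the embeddings' dtype, which plausibly matters for traced/exported graphs such as the `MegatronGPTExportableModel` path in this file.

```python
import torch

# Assumed hidden size for illustration; NeMo reads this from the model config.
hidden_size = 768

# Half-precision embeddings, as produced under mixed-precision training.
embeddings = torch.randn(2, 4, hidden_size).half()

# Old form: bare Python float multiplier.
# scaled = embeddings * (hidden_size ** 0.5)

# New form: the multiplier is a 0-dim tensor pinned to the embeddings' dtype,
# matching the change in the diff above.
scale = torch.tensor(hidden_size ** 0.5, dtype=embeddings.dtype)
scaled = embeddings * scale

assert scaled.dtype == embeddings.dtype  # float16 is preserved
```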