diff --git a/src/transformers/adapters/model_mixin.py b/src/transformers/adapters/model_mixin.py
index f82ee23e1e..9acd9a9691 100644
--- a/src/transformers/adapters/model_mixin.py
+++ b/src/transformers/adapters/model_mixin.py
@@ -153,6 +153,10 @@ def add_embeddings(self, name, tokenizer, reference_embedding=None, reference_to
         if embedding_dim is None:
             embedding_dim = self.config.hidden_size
         embedding = nn.Embedding(len(tokenizer), embedding_dim)
+        # Use same initialization as base Transformer model
+        embedding.weight.data.normal_(mean=0.0, std=0.02)
+        if embedding.padding_idx is not None:
+            embedding.weight.data[embedding.padding_idx].zero_()
         embedding.requires_grad_(False)
         if (reference_embedding is not None and reference_tokenizer is None) or (
             reference_tokenizer is not None and reference_embedding is None
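
For reference, a minimal standalone sketch of what the added lines do, outside the context of the patched method: re-initialize a freshly created nn.Embedding the way the base Transformer models initialize theirs (normal distribution with std 0.02, padding row zeroed) and then freeze it. The vocabulary size, embedding dimension, and padding index below are hypothetical values chosen for illustration; they are not taken from the patch.

import torch.nn as nn

# Hypothetical sizes for illustration only
vocab_size, embedding_dim, padding_idx = 30000, 768, 0

embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

# Same initialization as the base Transformer model:
# draw weights from N(0, 0.02), then zero out the padding embedding
embedding.weight.data.normal_(mean=0.0, std=0.02)
if embedding.padding_idx is not None:
    embedding.weight.data[embedding.padding_idx].zero_()

# As in the patched method, the new embedding is kept frozen
embedding.requires_grad_(False)

print(embedding.weight[padding_idx].abs().sum())  # tensor(0.) -- padding row stays zero

Re-running the normal_ initialization overwrites the zero row that nn.Embedding sets up for padding_idx at construction time, which is why the explicit zero_() on the padding row is needed after it.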