minor fix for attention docs #1205

Merged · 1 commit · Jan 31, 2024
burn-core/src/nn/attention/mha.rs (4 changes: 2 additions & 2 deletions)
@@ -13,7 +13,7 @@ use libm::sqrtf;
 /// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer.
 #[derive(Config)]
 pub struct MultiHeadAttentionConfig {
-    /// The size of the each linear layer.
+    /// The size of each linear layer.
     d_model: usize,
     /// The number of heads.
     n_heads: usize,
@@ -160,7 +160,7 @@ impl<B: Backend> MhaInput<B> {
 /// [Multihead attention](MultiHeadAttention) outputs.
 #[derive(Debug, Clone)]
 pub struct MhaOutput<B: Backend> {
-    /// The attention weights [batch_size, seq_length_1, seq_length_2].
+    /// The attention weights [batch_size, n_heads, seq_length_1, seq_length_2].
     pub weights: Tensor<B, 4>,
     /// The context tensor [batch_size, seq_length_1, d_model].
     pub context: Tensor<B, 3>,
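
For context beyond the diff: the weights tensor is 4-D because the layer keeps one attention map per head, which is exactly what the corrected doc comment records. Below is a minimal sketch of how the documented shapes surface in practice, assuming burn's NdArray backend and the public API as it exists today (MultiHeadAttentionConfig::new, MhaInput::self_attn, init taking a device); the init and Tensor::random signatures have shifted across burn versions, so treat this as illustrative rather than tied to the exact release this PR targets.

use burn::backend::NdArray;
use burn::nn::attention::{MhaInput, MultiHeadAttentionConfig};
use burn::tensor::{Distribution, Tensor};

type B = NdArray;

fn main() {
    let device = Default::default();
    let (batch_size, seq_length, d_model, n_heads) = (2, 10, 64, 4);

    // d_model is the size of each linear layer; n_heads splits it into heads.
    let mha = MultiHeadAttentionConfig::new(d_model, n_heads).init::<B>(&device);

    let query = Tensor::<B, 3>::random(
        [batch_size, seq_length, d_model],
        Distribution::Default,
        &device,
    );
    let output = mha.forward(MhaInput::self_attn(query));

    // One attention map per head, hence the 4-D shape documented by this PR:
    // [batch_size, n_heads, seq_length_1, seq_length_2].
    assert_eq!(output.weights.dims(), [batch_size, n_heads, seq_length, seq_length]);

    // The heads are merged back into d_model for the context:
    // [batch_size, seq_length_1, d_model].
    assert_eq!(output.context.dims(), [batch_size, seq_length, d_model]);
}

For self-attention, seq_length_1 and seq_length_2 coincide; they differ when query and key/value sequences have different lengths, which is why the doc comment distinguishes them.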