
Commit

nnf_multi_head_attention_forward: fix indices of input projection initialization bias (#1154)

* fix index translation from PyTorch

* add NEWS entry, improve documentation

* improve NEWS wording
cregouby authored May 14, 2024
1 parent bf56dea commit 0508611
Showing 3 changed files with 11 additions and 10 deletions.
1 change: 1 addition & 0 deletions NEWS.md
@@ -1,5 +1,6 @@
 # torch (development version)

+- Fix a bug when using the input projection initialization bias in `nnf_multi_head_attention_forward` (#1154, @cregouby)
 - Bugfix: calling `$detach()` on a tensor now preserves attributes (#1136)
 - Make sure deep cloning of tensor and nn_module preserves class attributes and the requires_grad field. (#1129)
 - Fixed that parameters and buffers of children of nn_modules were not cloned
12 changes: 6 additions & 6 deletions R/nnf-activation.R
@@ -486,12 +486,12 @@ nnf_threshold_ <- function(input, threshold, value) {
 #'
 #' @param embed_dim_to_check total dimension of the model.
 #' @param num_heads parallel attention heads.
-#' @param in_proj_weight input projection weight and bias.
+#' @param in_proj_weight input projection weight.
 #' @param bias_k bias of the key and value sequences to be added at dim=0.
 #' @param add_zero_attn add a new batch of zeros to the key and
 #' value sequences at dim=1.
 #' @param dropout_p probability of an element to be zeroed.
-#' @param out_proj_weight the output projection weight and bias.
+#' @param out_proj_weight the output projection weight.
 #' @param training apply dropout if is `TRUE`.
 #' @param key_padding_mask if provided, specified padding elements in the key will
 #' be ignored by the attention. This is an binary mask. When the value is True
@@ -526,9 +526,9 @@ nnf_threshold_ <- function(input, threshold, value) {
 #' @param avg_weights Logical; whether to average attn_output_weights over the
 #' attention heads before outputting them. This doesn't change the returned
 #' value of attn_output; it only affects the returned attention weight matrix.
-#' @param in_proj_bias currently undocumented.
+#' @param in_proj_bias input projection bias.
 #' @param bias_v currently undocumented.
-#' @param out_proj_bias currently undocumented.
+#' @param out_proj_bias output projection bias.
 #' @param k_proj_weight currently undocumented.
 #' @param v_proj_weight currently undocumented.
 #' @param static_v currently undocumented.
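As an aside, here is a minimal sketch of the shapes these documented projection parameters conventionally have, following the PyTorch convention that the R implementation mirrors; the dimension and the random tensors below are illustrative, not taken from the package:

```r
library(torch)

embed_dim <- 8  # illustrative model dimension

# The packed input projection stacks the query, key and value parts,
# so the weight is (3 * embed_dim, embed_dim) and the bias has length
# 3 * embed_dim; the forward pass later splits that bias into thirds.
in_proj_weight <- torch_randn(3 * embed_dim, embed_dim)
in_proj_bias   <- torch_randn(3 * embed_dim)

# The output projection maps the concatenated heads back to embed_dim.
out_proj_weight <- torch_randn(embed_dim, embed_dim)
out_proj_bias   <- torch_randn(embed_dim)
```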
@@ -649,8 +649,8 @@ nnf_multi_head_attention_forward <- function(query, # type: Tensor
   } else {
     if (!is.null(in_proj_bias)) {
       q <- nnf_linear(query, q_proj_weight, in_proj_bias[1:embed_dim])
-      k <- nnf_linear(key, k_proj_weight, in_proj_bias[embed_dim:(embed_dim * 2)])
-      v <- nnf_linear(value, v_proj_weight, in_proj_bias[(embed_dim * 2):N])
+      k <- nnf_linear(key, k_proj_weight, in_proj_bias[(embed_dim + 1):(embed_dim * 2)])
+      v <- nnf_linear(value, v_proj_weight, in_proj_bias[(embed_dim * 2 + 1):N])
     } else {
       q <- nnf_linear(query, q_proj_weight, in_proj_bias)
       k <- nnf_linear(key, k_proj_weight, in_proj_bias)
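For readers comparing this against the PyTorch source, a minimal plain-R sketch (not part of the commit) of why the index translation matters: R ranges are 1-based and inclusive at both ends, unlike PyTorch's half-open 0-based slices `[:embed_dim]`, `[embed_dim:embed_dim*2]` and `[embed_dim*2:]`, so the corrected ranges are needed to split the packed bias into disjoint query/key/value thirds. Plain integer vectors stand in for tensors here:

```r
embed_dim <- 4
N <- 3 * embed_dim
in_proj_bias <- seq_len(N)  # stand-in for the packed q/k/v bias

# Corrected 1-based ranges: a clean embed_dim + embed_dim + embed_dim split.
q_bias <- in_proj_bias[1:embed_dim]                      # elements 1..4
k_bias <- in_proj_bias[(embed_dim + 1):(embed_dim * 2)]  # elements 5..8
v_bias <- in_proj_bias[(embed_dim * 2 + 1):N]            # elements 9..12
stopifnot(identical(c(q_bias, k_bias, v_bias), in_proj_bias))

# The previous ranges reused the boundary elements and shifted the
# key and value biases by one position.
old_k <- in_proj_bias[embed_dim:(embed_dim * 2)]  # elements 4..8 (5 values)
old_v <- in_proj_bias[(embed_dim * 2):N]          # elements 8..12 (5 values)
```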
8 changes: 4 additions & 4 deletions man/nnf_multi_head_attention_forward.Rd

Some generated files are not rendered by default.
