diff --git a/NEWS.md b/NEWS.md
index ecdf0aa988..8e77711aa2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,6 @@
 # torch (development version)
 
+- Fix incorrect indexing of the input projection bias in `nnf_multi_head_attention_forward` (#1154 @cregouby)
 - Bugfix: calling `$detach()` on a tensor now preserves attributes (#1136)
 - Make sure deep cloning of tensor and nn_module preserves class attributes and the requires_grad field. (#1129)
 - Fixed that parameters and buffers of children of nn_modules were not cloned
diff --git a/R/nnf-activation.R b/R/nnf-activation.R
index e1f470a96c..e5aa7603e9 100644
--- a/R/nnf-activation.R
+++ b/R/nnf-activation.R
@@ -486,12 +486,12 @@ nnf_threshold_ <- function(input, threshold, value) {
 #'
 #' @param embed_dim_to_check total dimension of the model.
 #' @param num_heads parallel attention heads.
-#' @param in_proj_weight input projection weight and bias.
+#' @param in_proj_weight input projection weight.
 #' @param bias_k bias of the key and value sequences to be added at dim=0.
 #' @param add_zero_attn add a new batch of zeros to the key and
 #'   value sequences at dim=1.
 #' @param dropout_p probability of an element to be zeroed.
-#' @param out_proj_weight the output projection weight and bias.
+#' @param out_proj_weight the output projection weight.
 #' @param training apply dropout if is `TRUE`.
 #' @param key_padding_mask if provided, specified padding elements in the key will
 #'   be ignored by the attention. This is an binary mask. When the value is True
@@ -526,9 +526,9 @@ nnf_threshold_ <- function(input, threshold, value) {
 #' @param avg_weights Logical; whether to average attn_output_weights over the
 #'   attention heads before outputting them. This doesn't change the returned
 #'   value of attn_output; it only affects the returned attention weight matrix.
-#' @param in_proj_bias currently undocumented.
+#' @param in_proj_bias input projection bias.
 #' @param bias_v currently undocumented.
-#' @param out_proj_bias currently undocumented.
+#' @param out_proj_bias output projection bias.
 #' @param k_proj_weight currently undocumented.
 #' @param v_proj_weight currently undocumented.
 #' @param static_v currently undocumented.
@@ -649,8 +649,8 @@ nnf_multi_head_attention_forward <- function(query, # type: Tensor
   } else {
     if (!is.null(in_proj_bias)) {
       q <- nnf_linear(query, q_proj_weight, in_proj_bias[1:embed_dim])
-      k <- nnf_linear(key, k_proj_weight, in_proj_bias[embed_dim:(embed_dim * 2)])
-      v <- nnf_linear(value, v_proj_weight, in_proj_bias[(embed_dim * 2):N])
+      k <- nnf_linear(key, k_proj_weight, in_proj_bias[(embed_dim + 1):(embed_dim * 2)])
+      v <- nnf_linear(value, v_proj_weight, in_proj_bias[(embed_dim * 2 + 1):N])
     } else {
       q <- nnf_linear(query, q_proj_weight, in_proj_bias)
       k <- nnf_linear(key, k_proj_weight, in_proj_bias)
diff --git a/man/nnf_multi_head_attention_forward.Rd b/man/nnf_multi_head_attention_forward.Rd
index 9dd63c62c5..9068abe327 100644
--- a/man/nnf_multi_head_attention_forward.Rd
+++ b/man/nnf_multi_head_attention_forward.Rd
@@ -46,9 +46,9 @@ the embedding dimension. If batch_first is TRUE, the first two dimensions are tr
 
 \item{num_heads}{parallel attention heads.}
 
-\item{in_proj_weight}{input projection weight and bias.}
+\item{in_proj_weight}{input projection weight.}
 
-\item{in_proj_bias}{currently undocumented.}
+\item{in_proj_bias}{input projection bias.}
 
 \item{bias_k}{bias of the key and value sequences to be added at dim=0.}
@@ -59,9 +59,9 @@ value sequences at dim=1.}
 
 \item{dropout_p}{probability of an element to be zeroed.}
 
-\item{out_proj_weight}{the output projection weight and bias.}
+\item{out_proj_weight}{the output projection weight.}
 
-\item{out_proj_bias}{currently undocumented.}
+\item{out_proj_bias}{output projection bias.}
 
 \item{training}{apply dropout if is \code{TRUE}.}
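
For reviewers, the core of the R change is an off-by-one fix in 1-based slicing: `in_proj_bias` packs the query, key, and value biases end to end, so the three blocks must be `1:embed_dim`, `(embed_dim + 1):(embed_dim * 2)`, and `(embed_dim * 2 + 1):N`. Below is a minimal sketch of that indexing logic, not part of the patch; it uses a plain numeric vector as a stand-in for the packed bias tensor, and the `embed_dim` value and variable names are illustrative only (`N` mirrors the upper bound used in the patched lines).

```r
embed_dim <- 4
N <- 3 * embed_dim                 # length of the packed q/k/v bias (mirrors `N` in the patch)
in_proj_bias <- seq_len(N)         # stand-in for the packed bias tensor

# Old slices: R ranges are 1-based and inclusive at both ends, so these blocks
# overlap at the boundaries and contain embed_dim + 1 elements each.
k_old <- in_proj_bias[embed_dim:(embed_dim * 2)]         # elements 4..8
v_old <- in_proj_bias[(embed_dim * 2):N]                 # elements 8..12

# Fixed slices: three disjoint blocks of exactly embed_dim elements.
q_new <- in_proj_bias[1:embed_dim]                       # elements 1..4
k_new <- in_proj_bias[(embed_dim + 1):(embed_dim * 2)]   # elements 5..8
v_new <- in_proj_bias[(embed_dim * 2 + 1):N]             # elements 9..12

stopifnot(
  length(k_old) == embed_dim + 1,  # the bug: one extra, overlapping element
  length(k_new) == embed_dim,
  length(v_new) == embed_dim
)
```

With the old bounds, the key and value slices each reused the last element of the preceding block and were one element longer than `embed_dim`; the patched lines in `R/nnf-activation.R` make the three blocks disjoint.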