diff --git a/NEWS.md b/NEWS.md
index ecdf0aa988..8e77711aa2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,6 @@
 # torch (development version)
 
+- Fix incorrect indexing of the input projection bias in `nnf_multi_head_attention_forward` (#1154 @cregouby)
 - Bugfix: calling `$detach()` on a tensor now preserves attributes (#1136)
 - Make sure deep cloning of tensor and nn_module preserves class attributes and the requires_grad field. (#1129)
 - Fixed that parameters and buffers of children of nn_modules were not cloned
diff --git a/R/nnf-activation.R b/R/nnf-activation.R
index e1f470a96c..e5aa7603e9 100644
--- a/R/nnf-activation.R
+++ b/R/nnf-activation.R
@@ -486,12 +486,12 @@ nnf_threshold_ <- function(input, threshold, value) {
 #'
 #' @param embed_dim_to_check total dimension of the model.
 #' @param num_heads parallel attention heads.
-#' @param in_proj_weight input projection weight and bias.
+#' @param in_proj_weight input projection weight.
 #' @param bias_k bias of the key and value sequences to be added at dim=0.
 #' @param add_zero_attn add a new batch of zeros to the key and
 #'   value sequences at dim=1.
 #' @param dropout_p probability of an element to be zeroed.
-#' @param out_proj_weight the output projection weight and bias.
+#' @param out_proj_weight the output projection weight.
 #' @param training apply dropout if is `TRUE`.
 #' @param key_padding_mask if provided, specified padding elements in the key will
 #'   be ignored by the attention. This is an binary mask. When the value is True
@@ -526,9 +526,9 @@ nnf_threshold_ <- function(input, threshold, value) {
 #' @param avg_weights Logical; whether to average attn_output_weights over the
 #'   attention heads before outputting them. This doesn't change the returned
 #'   value of attn_output; it only affects the returned attention weight matrix.
-#' @param in_proj_bias currently undocumented.
+#' @param in_proj_bias input projection bias.
 #' @param bias_v currently undocumented.
-#' @param out_proj_bias currently undocumented.
+#' @param out_proj_bias output projection bias.
 #' @param k_proj_weight currently undocumented.
 #' @param v_proj_weight currently undocumented.
 #' @param static_v currently undocumented.
@@ -649,8 +649,8 @@ nnf_multi_head_attention_forward <- function(query, # type: Tensor
   } else {
     if (!is.null(in_proj_bias)) {
       q <- nnf_linear(query, q_proj_weight, in_proj_bias[1:embed_dim])
-      k <- nnf_linear(key, k_proj_weight, in_proj_bias[embed_dim:(embed_dim * 2)])
-      v <- nnf_linear(value, v_proj_weight, in_proj_bias[(embed_dim * 2):N])
+      k <- nnf_linear(key, k_proj_weight, in_proj_bias[(embed_dim + 1):(embed_dim * 2)])
+      v <- nnf_linear(value, v_proj_weight, in_proj_bias[(embed_dim * 2 + 1):N])
     } else {
       q <- nnf_linear(query, q_proj_weight, in_proj_bias)
       k <- nnf_linear(key, k_proj_weight, in_proj_bias)
diff --git a/man/nnf_multi_head_attention_forward.Rd b/man/nnf_multi_head_attention_forward.Rd
index 9dd63c62c5..9068abe327 100644
--- a/man/nnf_multi_head_attention_forward.Rd
+++ b/man/nnf_multi_head_attention_forward.Rd
@@ -46,9 +46,9 @@ the embedding dimension. If batch_first is TRUE, the first two dimensions are tr
 
 \item{num_heads}{parallel attention heads.}
 
-\item{in_proj_weight}{input projection weight and bias.}
+\item{in_proj_weight}{input projection weight.}
 
-\item{in_proj_bias}{currently undocumented.}
+\item{in_proj_bias}{input projection bias.}
 
 \item{bias_k}{bias of the key and value sequences to be added at dim=0.}
@@ -59,9 +59,9 @@ value sequences at dim=1.}
 
 \item{dropout_p}{probability of an element to be zeroed.}
 
-\item{out_proj_weight}{the output projection weight and bias.}
+\item{out_proj_weight}{the output projection weight.}
 
-\item{out_proj_bias}{currently undocumented.}
+\item{out_proj_bias}{output projection bias.}
 
 \item{training}{apply dropout if is \code{TRUE}.}
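
For reviewers, the core of the R change is an off-by-one fix in 1-based slicing: `in_proj_bias` packs the query, key, and value biases end to end, so the three blocks must be `1:embed_dim`, `(embed_dim + 1):(embed_dim * 2)`, and `(embed_dim * 2 + 1):N`. Below is a minimal sketch of that indexing logic, not part of the patch; it uses a plain numeric vector as a stand-in for the packed bias tensor, and the `embed_dim` value and variable names are illustrative only (`N` mirrors the upper bound used in the patched lines).

```r
embed_dim <- 4
N <- 3 * embed_dim                 # length of the packed q/k/v bias (mirrors `N` in the patch)
in_proj_bias <- seq_len(N)         # stand-in for the packed bias tensor

# Old slices: R ranges are 1-based and inclusive at both ends, so these blocks
# overlap at the boundaries and contain embed_dim + 1 elements each.
k_old <- in_proj_bias[embed_dim:(embed_dim * 2)]         # elements 4..8
v_old <- in_proj_bias[(embed_dim * 2):N]                 # elements 8..12

# Fixed slices: three disjoint blocks of exactly embed_dim elements.
q_new <- in_proj_bias[1:embed_dim]                       # elements 1..4
k_new <- in_proj_bias[(embed_dim + 1):(embed_dim * 2)]   # elements 5..8
v_new <- in_proj_bias[(embed_dim * 2 + 1):N]             # elements 9..12

stopifnot(
  length(k_old) == embed_dim + 1,  # the bug: one extra, overlapping element
  length(k_new) == embed_dim,
  length(v_new) == embed_dim
)
```

With the old bounds, the key and value slices each reused the last element of the preceding block and were one element longer than `embed_dim`; the patched lines in `R/nnf-activation.R` make the three blocks disjoint.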