[Paddle Inference] Add masked multihead attention kernel and export API. #55344
@@ -1541,6 +1541,17 @@
     data_type : logits
     backward : margin_cross_entropy_grad

 - op : masked_multihead_attention_
   args : (Tensor x, Tensor bias, Tensor src_mask, Tensor sequence_lengths, Tensor rotary_tensor, Tensor beam_cache_offset, Tensor cache_kv, Tensor qkv_out_scale, Tensor out_linear_shift, Tensor out_linear_smooth, int beam_size, int rotary_emb_dims, bool mask_broadcast_num_heads=true, bool compute_bias=false, bool use_neox_rotary_style=false, float out_linear_in_scale=-1, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0)
   output : Tensor(out), Tensor(cache_kv_out), Tensor(beam_cache_offset_out)

Review comment: out_linear_in_scale, out_linear_shift, and out_linear_smooth are all parts fused on top of the standard attention. Descriptions of these inputs should be added, or consider renaming them.

   infer_meta :
     func : MaskedMultiheadAttentionInferMeta
   kernel :
     func : masked_multihead_attention
     data_type : cache_kv
   optional : bias, src_mask, sequence_lengths, rotary_tensor, beam_cache_offset, qkv_out_scale, out_linear_shift, out_linear_smooth
   inplace : (cache_kv -> cache_kv_out), (beam_cache_offset -> beam_cache_offset_out)

Review comment: should add OP(eg.

Review comment: masked_multihead_attention is currently only used for inference; whether to add a backward op later still needs further discussion.

 - op : masked_select
   args : (Tensor x, Tensor mask)
   output : Tensor (out)
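For reference, below is a rough Python sketch of how the exported op might be called. The import path paddle.incubate.nn.functional.masked_multihead_attention, the keyword names, and the cache_kv layout [2, batch_size, num_head, max_seq_len, dim_head] are assumptions inferred from the YAML signature and the InferMeta checks in this diff, not confirmed by the PR itself.

# Hypothetical usage sketch; the import path, keyword names, and cache
# layout are assumptions inferred from the op signature, not confirmed
# by this PR.
import paddle
from paddle.incubate.nn import functional as F

bsz, num_head, dim_head, max_seq_len = 2, 32, 128, 1024

# x packs the fused QKV projection for one decoding step:
# (batch_size, 3, num_head, dim_head), as enforced by InferMeta below.
x = paddle.randn([bsz, 3, num_head, dim_head], dtype="float16")

# cache_kv must be 5-D with a leading dim of 2 (one slot for K, one
# for V); the remaining layout is an assumption.
cache_kv = paddle.zeros([2, bsz, num_head, max_seq_len, dim_head],
                        dtype="float16")

# Matches the YAML output spec: (out, cache_kv_out, beam_cache_offset_out).
out, cache_kv_out, _ = F.masked_multihead_attention(
    x,
    cache_kv=cache_kv,
    beam_size=1,
    rotary_emb_dims=0,
    out_linear_in_scale=-1.0)  # <= 0 keeps out in x's dtype (no int8 quant)
print(out.shape)  # [2, 1, 32, 128] since sequence_lengths is not passed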
@@ -3624,5 +3624,73 @@ void WeightOnlyMatmulInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }

 void MaskedMultiheadAttentionInferMeta(const MetaTensor& x,
                                        const MetaTensor& bias,
                                        const MetaTensor& src_mask,
                                        const MetaTensor& sequence_lengths,
                                        const MetaTensor& rotary_tensor,
                                        const MetaTensor& beam_cache_offset,
                                        const MetaTensor& cache_kv,
                                        const MetaTensor& qkv_out_scale,
                                        const MetaTensor& out_linear_shift,
                                        const MetaTensor& out_linear_smooth,
                                        int beam_size,
                                        int rotary_emb_dims,
                                        const bool mask_broadcast_num_heads,
                                        const bool compute_bias,
                                        const bool use_neox_rotary_style,
                                        const float out_linear_in_scale,
                                        const int quant_round_type,
                                        const float quant_max_bound,
                                        const float quant_min_bound,
                                        MetaTensor* out,
                                        MetaTensor* cache_kv_out,
                                        MetaTensor* beam_cache_offset_out) {

Review comment: the meaning of the beam_cache_offset output is also unclear.

   auto x_dims = x.dims();
   auto cache_kv_dims = cache_kv.dims();
   auto x_dtype = x.dtype();
   int bsz = x_dims[0];
   int num_head = x_dims[2];
   int dim_head = x_dims[3];

   // When per-sample sequence lengths are given, the seq dim of 1 is squeezed.
   if (sequence_lengths) {
     out->set_dims({bsz, num_head, dim_head});
   } else {
     out->set_dims({bsz, 1, num_head, dim_head});
   }
   // A positive out_linear_in_scale means the output is quantized to int8.
   if (out_linear_in_scale > 0) {
     out->set_dtype(DataType::INT8);
   } else {
     out->set_dtype(x_dtype);
   }

   PADDLE_ENFORCE_EQ(
       x_dims.size(),
       4,
       errors::InvalidArgument("The dimensions of x must be 4 "
                               "(batch_size, 3, num_head, dim_head), "
                               "but received dimensions of "
                               "Input is [%d]",
                               x_dims.size()));

Review comment: I think we should also check that the seq_len dimension of x must be 1.

   PADDLE_ENFORCE_EQ(
       cache_kv_dims.size(),
       5,
       errors::InvalidArgument("The cache_kv must be 5 dims, but got %d",
                               cache_kv_dims.size()));
   PADDLE_ENFORCE_EQ(
       cache_kv_dims[0],
       2,
       errors::InvalidArgument("The first dim of cache_kv must be 2, but got %d",
                               cache_kv_dims[0]));

   cache_kv_out->set_dims(cache_kv_dims);
   cache_kv_out->set_dtype(cache_kv.dtype());

   if (beam_cache_offset) {
     beam_cache_offset_out->set_dims(beam_cache_offset.dims());
     beam_cache_offset_out->set_dtype(beam_cache_offset.dtype());
   }
 }

 }  // namespace phi
 PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta);
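The shape and dtype rules above can be summarized in a short Python transcription (illustrative only; it mirrors the C++ InferMeta logic shown in this diff):

# Python transcription of MaskedMultiheadAttentionInferMeta (illustrative).
def masked_mha_infer_shapes(x_shape, x_dtype, cache_kv_shape,
                            has_sequence_lengths, out_linear_in_scale):
    assert len(x_shape) == 4, "x must be (batch_size, 3, num_head, dim_head)"
    assert len(cache_kv_shape) == 5, "cache_kv must be 5-D"
    assert cache_kv_shape[0] == 2, "first dim of cache_kv must be 2 (K and V)"

    bsz, _, num_head, dim_head = x_shape
    # With per-sample sequence lengths, the seq dim of 1 is squeezed out.
    out_shape = ([bsz, num_head, dim_head] if has_sequence_lengths
                 else [bsz, 1, num_head, dim_head])
    # A positive out_linear_in_scale quantizes the output to int8.
    out_dtype = "int8" if out_linear_in_scale > 0 else x_dtype
    # cache_kv_out is updated in place, so it keeps its shape and dtype.
    return out_shape, out_dtype, cache_kv_shape

print(masked_mha_infer_shapes([2, 3, 32, 128], "float16",
                              [2, 2, 32, 1024, 128], False, -1.0))
# -> ([2, 1, 32, 128], 'float16', [2, 2, 32, 1024, 128])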
Review comment: I feel decoder_masked_multihead_attention_ would be a more appropriate name.

Review comment: because it integrates rotary_embedding, attention, etc., it should be named fused_masked_multihead_attention according to the naming conventions.