PaddlePaddle
diff --git a/‎csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h‎
Lines changed: 26 additions & 17 deletions b/‎csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h‎
Lines changed: 26 additions & 17 deletions
diff --git a/‎csrc/gpu/update_inputs_v2.cu‎
Lines changed: 22 additions & 4 deletions b/‎csrc/gpu/update_inputs_v2.cu‎
Lines changed: 22 additions & 4 deletions
diff --git a/‎docs/zh/index.rst‎
Lines changed: 4 additions & 0 deletions b/‎docs/zh/index.rst‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/zh/llm/devices/intel_hpu/tests/README.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/llm/devices/intel_hpu/tests/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/llm/docs/alignment_tutorial.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/llm/docs/alignment_tutorial.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/llm/docs/finetune_tutorial.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/llm/docs/finetune_tutorial.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/llm/docs/pretrain.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/llm/docs/pretrain.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/llm/docs/pretrain_tutorial.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/llm/docs/pretrain_tutorial.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/llm/docs/quantization_tutorial.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/llm/docs/quantization_tutorial.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.taskflow.utils.po‎
Lines changed: 2 additions & 2 deletions b/‎docs/zh/locale/en/LC_MESSAGES/source/paddlenlp.taskflow.utils.po‎
Lines changed: 2 additions & 2 deletions
@@ -69,7 +69,8 @@ void generic_moe_gemm_kernelLauncher(const T* A,
                                      cudaStream_t stream,
                                      int* kernel_occupancy = nullptr) {
   if (gemm_config.split_k_style != SplitKStyle::NO_SPLIT_K) {
-    PADDLE_FATAL("[MoeGemm] Grouped gemm does not support split-k");
+    throw std::runtime_error(
+        "[MoeGemm] Grouped gemm does not support split-k");
   }
 
 #ifdef PADDLE_CUDA_BF16
@@ -169,7 +170,7 @@ void generic_moe_gemm_kernelLauncher(const T* A,
   int occupancy = std::min(2, GemmGrouped::maximum_active_blocks());
 
   if (occupancy == 0) {
-    PADDLE_FATAL(
+    throw std::runtime_error(
         "[MoE Runner] GPU lacks the shared memory resources to run "
         "GroupedGEMM kernel");
   }
@@ -197,7 +198,7 @@ void generic_moe_gemm_kernelLauncher(const T* A,
   if (can_implement != cutlass::Status::kSuccess) {
     std::string err_msg = "MoEFC kernel will fail for params. Error: " +
                           std::string(cutlassGetStatusString(can_implement));
-    PADDLE_FATAL("[MoE Runner] " + err_msg);
+    throw std::runtime_error("[MoE Runner] " + err_msg);
   }
 
   auto init_status = gemm.initialize(args);
@@ -243,7 +244,7 @@ struct dispatch_stages {
     std::string err_msg = "Cutlass fpA_intB gemm. Not instantiates for arch " +
                           std::to_string(arch::kMinComputeCapability) +
                           " with stages set to " + std::to_string(Stages);
-    PADDLE_FATAL("[dispatch_stages::dispatch] " + err_msg);
+    throw std::runtime_error("[dispatch_stages::dispatch] " + err_msg);
   }
 };
 
@@ -394,7 +395,8 @@ void dispatch_gemm_config(const T* A,
     default:
       std::string err_msg = "dispatch_gemm_config does not support stages " +
                             std::to_string(gemm_config.stages);
-      PADDLE_FATAL("[MoE][dispatch_gemm_config] " + err_msg);
+      throw std::runtime_error(
+          "[MoE][dispatch_gemm_config] " + err_msg);
       break;
   }
 }
@@ -452,15 +454,16 @@ void dispatch_moe_gemm_to_cutlass(const T* A,
     dispatch_gemm_config_macro(64, 128, 64, 32, 64, 64);
     dispatch_gemm_config_macro(128, 128, 64, 64, 32, 64);
     case CutlassTileConfig::Undefined:
-      PADDLE_FATAL("[dispatch_moe_gemm_to_cutlass] gemm config undefined.");
+      throw std::runtime_error(
+          "[dispatch_moe_gemm_to_cutlass] gemm config undefined.");
       break;
     case CutlassTileConfig::ChooseWithHeuristic:
-      PADDLE_FATAL(
+      throw std::runtime_error(
           "[dispatch_moe_gemm_to_cutlass] gemm config should have "
           "already been set by heuristic.");
       break;
     default:
-      PADDLE_FATAL(
+      throw std::runtime_error(
           "[dispatch_moe_gemm_to_cutlass] Config is invalid for same "
           "type MoE tensorop GEMM.");
       break;
@@ -497,38 +500,44 @@ void dispatch_moe_gemm_to_cutlass(const T* A,
       dispatch_gemm_config_macro(32, 128, 64, 32, 32, 64);
       dispatch_gemm_config_macro(64, 128, 64, 64, 64, 64);
       case CutlassTileConfig::Undefined:
-        PADDLE_FATAL("[dispatch_moe_gemm_to_cutlass] gemm config undefined.");
+        throw std::runtime_error(
+            "[dispatch_moe_gemm_to_cutlass] gemm config undefined.");
         break;
       case CutlassTileConfig::ChooseWithHeuristic:
-        PADDLE_FATAL(
+        throw std::runtime_error(
             "[dispatch_moe_gemm_to_cutlass] gemm config should have "
             "already been set by heuristic.");
         break;
       default:
-        PADDLE_FATAL(
+        throw std::runtime_error(
             "[dispatch_moe_gemm_to_cutlass] Config is invalid for "
             "mixed type tensorop GEMM.");
         break;
     }
   } else {
     switch (gemm_config.tile_config) {
       dispatch_gemm_config_macro(16, 128, 64, 16, 32, 64);
+      dispatch_gemm_config_macro(16, 256, 64, 16, 64, 64);
+      dispatch_gemm_config_macro(64, 64, 64, 32, 32, 64);
       dispatch_gemm_config_macro(32, 128, 64, 32, 32, 64);
+      dispatch_gemm_config_macro(128, 64, 64, 64, 32, 64);
       dispatch_gemm_config_macro(64, 128, 64, 64, 64, 64);
       dispatch_gemm_config_macro(128, 128, 64, 64, 64, 64);
       dispatch_gemm_config_macro(128, 128, 64, 128, 32, 64);
       dispatch_gemm_config_macro(128, 256, 64, 64, 64, 64);
       dispatch_gemm_config_macro(64, 128, 64, 64, 32, 64);
+      dispatch_gemm_config_macro(256, 128, 64, 64, 64, 64);
       case CutlassTileConfig::Undefined:
-        PADDLE_FATAL("[dispatch_moe_gemm_to_cutlass] gemm config undefined.");
+        throw std::runtime_error(
+            "[dispatch_moe_gemm_to_cutlass] gemm config undefined.");
         break;
       case CutlassTileConfig::ChooseWithHeuristic:
-        PADDLE_FATAL(
+        throw std::runtime_error(
             "[dispatch_moe_gemm_to_cutlass] gemm config should have "
             "already been set by heuristic.");
         break;
       default:
-        PADDLE_FATAL(
+        throw std::runtime_error(
             "[dispatch_moe_gemm_to_cutlass] Config is invalid for "
             "mixed type tensorop GEMM.");
         break;
@@ -561,17 +570,17 @@ void dispatch_moe_gemm_to_cutlass(const T* A,
   switch (gemm_config.tile_config) {
     dispatch_gemm_config_macro(128, 128, 8, 64, 64, 8);
     case CutlassTileConfig::Undefined:
-      PADDLE_FATAL(
+      throw std::runtime_error(
           "[dispatch_moe_gemm_to_cutlass][SIMT] gemm config "
           "undefined.");
       break;
     case CutlassTileConfig::ChooseWithHeuristic:
-      PADDLE_FATAL(
+      throw std::runtime_error(
           "[dispatch_moe_gemm_to_cutlass][SIMT] gemm config should "
           "have already been set by heuristic.");
       break;
     default:
-      PADDLE_FATAL(
+      throw std::runtime_error(
           "[dispatch_moe_gemm_to_cutlass][SIMT] Unsupported config "
           "for float MoE gemm.");
       break;
 
@@ -42,8 +42,10 @@ __global__ void update_inputs_kernel_v2(
     const int bsz,
     const int max_bsz,
     const int input_ids_stride,
-    const int end_length) {
+    const int end_length,
+    const int Flag_truncated_return_eos) {
   int thread_idx = threadIdx.x;
+  bool output_len_truncated = false;
   // update step_idx and stop_flags
   if (thread_idx < max_bsz) {
     bool stop_flag = stop_flags[thread_idx];
@@ -52,6 +54,7 @@ __global__ void update_inputs_kernel_v2(
     }
     if (step_idx[thread_idx] >= max_dec_len[thread_idx]) {
       stop_flags[thread_idx] = true;
+      output_len_truncated = true;
     }
   }
   __syncthreads();
@@ -61,8 +64,13 @@ __global__ void update_inputs_kernel_v2(
       if (seq_lens_this_time[thread_idx] == 0) {
         next_tokens[thread_idx] = -1;
       } else {
-        next_tokens[thread_idx] = end_ids[0];
-        kwargs_next_tokens[thread_idx] = end_ids[0];
+        if (!Flag_truncated_return_eos && output_len_truncated) {
+          // Output hit max_dec_len (truncated): keep the current token instead of forcing EOS (for RL).
+          kwargs_next_tokens[thread_idx] = next_tokens[thread_idx];
+        }else{
+          next_tokens[thread_idx] = end_ids[0];
+          kwargs_next_tokens[thread_idx] = end_ids[0];
+        }
       }
     } else {
       kwargs_next_tokens[thread_idx] = next_tokens[thread_idx];
@@ -127,6 +135,15 @@ void UpdateInputesV2(const paddle::Tensor& stop_flags,
   const int end_length = end_ids.shape()[0];
 
   auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
+  int Flag_truncated_return_eos = 1;
+  if (const char* inference_truncated_return_eos_env_p =
+          std::getenv("INFERENCE_TRUNCATED_RETURN_EOS")) {
+      std::string inference_truncated_return_eos_env_str(
+          inference_truncated_return_eos_env_p);
+      int inference_truncated_return_eos_from_env =
+          std::stoi(inference_truncated_return_eos_env_str);
+      Flag_truncated_return_eos = inference_truncated_return_eos_from_env;
+  }
 
   update_inputs_kernel_v2<1024><<<1, 1024, 0, input_ids.stream()>>>(
     const_cast<bool*>(not_need_stop_gpu.data<bool>()),
@@ -145,7 +162,8 @@ void UpdateInputesV2(const paddle::Tensor& stop_flags,
     now_bsz,
     max_bsz,
     input_ids_stride,
-    end_length
+    end_length,
+    Flag_truncated_return_eos
   );
 
   auto not_need_stop_cpu = not_need_stop_gpu.copy_to(not_need_stop.place(), false);
 
@@ -128,6 +128,10 @@
    :caption: 实践教程
 
    AI Studio Notebook <tutorials/overview>
+   大模型预训练新手指南 <llm/docs/pretrain_tutorial.md>
+   大模型精调新手指南 <llm/docs/finetune_tutorial.md>
+   大模型对齐新手指南 <llm/docs/alignment_tutorial.md>
+   大模型量化新手指南 <llm/docs/quantization_tutorial.md>
 
 .. toctree::
    :maxdepth: 1
 
@@ -0,0 +1 @@
+../../../../../../llm/devices/intel_hpu/tests/README.md
@@ -0,0 +1 @@
+../../../../llm/docs/alignment_tutorial.md
@@ -0,0 +1 @@
+../../../../llm/docs/finetune_tutorial.md
@@ -0,0 +1 @@
+../../../../llm/docs/pretrain.md
@@ -0,0 +1 @@
+../../../../llm/docs/pretrain_tutorial.md
@@ -0,0 +1 @@
+../../../../llm/docs/quantization_tutorial.md
@@ -276,7 +276,7 @@ msgid "word of current node."
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.BurkhardKellerTree:1
-msgid "Implementataion of BK-Tree"
+msgid "Implementation of BK-Tree"
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.BurkhardKellerTree.add:1
@@ -300,7 +300,7 @@ msgid "similar words."
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.TriedTree:1
-msgid "Implementataion of TriedTree"
+msgid "Implementation of TriedTree"
 msgstr ""
 
 #: of paddlenlp.taskflow.utils.TriedTree.add_word:1
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../../../../llm/devices/intel_hpu/tests/README.md`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../../llm/docs/alignment_tutorial.md`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../../llm/docs/finetune_tutorial.md`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../../llm/docs/pretrain_tutorial.md`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../../llm/docs/quantization_tutorial.md`