PaddlePaddle
diff --git a/.github/actions/check-bypass/action.yml
Lines changed: 1 addition & 1 deletion b/.github/actions/check-bypass/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/Api-Benchmark-baseline.yml
Lines changed: 15 additions & 5 deletions b/.github/workflows/Api-Benchmark-baseline.yml
Lines changed: 15 additions & 5 deletions
diff --git a/.github/workflows/_Api-Benchmark.yml
Lines changed: 5 additions & 3 deletions b/.github/workflows/_Api-Benchmark.yml
Lines changed: 5 additions & 3 deletions
diff --git a/.github/workflows/check-bypass.yml
Lines changed: 1 addition & 1 deletion b/.github/workflows/check-bypass.yml
Lines changed: 1 addition & 1 deletion
diff --git a/cmake/external/gtest.cmake
Lines changed: 1 addition & 0 deletions b/cmake/external/gtest.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
Lines changed: 4 additions & 4 deletions b/paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu
Lines changed: 31 additions & 5 deletions b/paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu
Lines changed: 31 additions & 5 deletions
diff --git a/paddle/phi/backends/custom/custom_context.h
Lines changed: 0 additions & 6 deletions b/paddle/phi/backends/custom/custom_context.h
Lines changed: 0 additions & 6 deletions
diff --git a/paddle/phi/infermeta/unary.cc
Lines changed: 4 additions & 2 deletions b/paddle/phi/infermeta/unary.cc
Lines changed: 4 additions & 2 deletions
diff --git a/paddle/phi/kernels/gpu/weight_quantize_kernel.cu
Lines changed: 8 additions & 1 deletion b/paddle/phi/kernels/gpu/weight_quantize_kernel.cu
Lines changed: 8 additions & 1 deletion
@@ -22,7 +22,7 @@ runs:
       uses: PFCCLab/ci-bypass@v1
       with:
         github-token: ${{ inputs.github-token }}
-        non-pull-request-event-strategy: 'always-skipped'
+        non-pull-request-event-strategy: 'never-skipped'
         type: 'composite'
         composite-rule: |
           {
 
@@ -29,15 +29,13 @@ defaults:
 jobs:
   clone:
     name: Api benchmark clone
-    if: github.event.schedule == '0 21 * * *'
     uses: ./.github/workflows/_Clone-linux.yml
     with:
       clone_dir: Paddle-build
       is_pr: 'false'
 
   build-docker:
     name: Api benchmark build docker
-    if: github.event.schedule == '0 21 * * *'
     needs: clone
     uses: ./.github/workflows/docker.yml
     with:
@@ -46,21 +44,33 @@ jobs:
 
   build:
     name: Api benchmark build
-    if: github.event.schedule == '0 21 * * *'
+    if: github.event_name == 'schedule' && github.event.schedule == '0 21 * * *'
     needs: [clone, build-docker]
     uses: ./.github/workflows/_Linux-build.yml
     with:
       docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
       is_pr: 'false'
 
-  api-benchmark-baseline:
+  api-benchmark-baseline-schedule:
     name: Api benchmark baseline
-    if: github.event.schedule == '0 21 * * *' || github.event.inputs.job-name == 'api-benchmark'
     strategy:
       matrix:
         run-labels: [api-bm-20, api-bm-27]
     uses: ./.github/workflows/_Api-Benchmark.yml
     needs: [clone, build-docker, build]
+    with:
+      docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
+      baseline: 'true'
+      run-labels: ${{ matrix.run-labels }}
+
+  api-benchmark-baseline-pr:
+    name: Api benchmark baseline
+    if: github.event_name == 'workflow_dispatch' && github.event.inputs.job-name == 'api-benchmark'
+    strategy:
+      matrix:
+        run-labels: [api-bm-20, api-bm-27]
+    uses: ./.github/workflows/_Api-Benchmark.yml
+    needs: [clone, build-docker]
     with:
       docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
       baseline: 'true'
 
@@ -133,15 +133,17 @@ jobs:
           cd ./PaddleTest/framework/e2e/api_benchmark_new
           cp /paddle/PTSTools/Uploader/apibm_config.yml .
           source ${{ github.workspace }}/../../../proxy
-          ${python} -m pip install $wheel_link
           if [[ "${{ inputs.baseline }}" == "true" ]];then
             if [[ "${{ inputs.MANUALLY_PR_ID }}" == "" ]]; then
-              ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link https://paddle-github-action.bj.bcebos.com/PR/build/${{ github.event.pull_request.number }}/${{ github.event.pull_request.head.sha }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+              export pr_wheel_link=https://paddle-github-action.bj.bcebos.com/PR/build/${{ github.event.pull_request.number }}/${{ github.event.pull_request.head.sha }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
             else
-              ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+              export pr_wheel_link=https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
             fi
+            ${python} -m pip install $pr_wheel_link
+            ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link $pr_wheel_link
             exit 0
           fi
+          ${python} -m pip install $wheel_link
           if [ ${core_index} -eq -1 ];then
             ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --core_index 2
           else
 
@@ -33,7 +33,7 @@ jobs:
         uses: PFCCLab/ci-bypass@v1
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          non-pull-request-event-strategy: 'always-skipped'
+          non-pull-request-event-strategy: 'never-skipped'
           type: 'composite'
           composite-rule: |
             {
 
@@ -126,6 +126,7 @@ else()
                -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
                -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+               -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
                -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                -DBUILD_GMOCK=ON
                -Dgtest_disable_pthreads=ON
 
@@ -1685,11 +1685,11 @@ Buffer::low_latency_dispatch(
   EP_HOST_ASSERT(!(async && return_recv_hook));
   if (!return_recv_hook) stream_wait(launch_stream, compute_stream);
 
-  EP_HOST_ASSERT(
-      !(expertwise_scale.has_value() && use_fp8) &&
-      "expertwise_scale and use_fp8 can not arise at the same time.");
   auto return_x_dtype = phi::DataType::BFLOAT16;
   if (use_fp8) {
+    if (expertwise_scale.has_value()) {
+      EP_HOST_ASSERT(expertwise_scale.value().size(0) == num_experts);
+    }
     return_x_dtype = phi::DataType::FLOAT8_E4M3FN;
   } else if (expertwise_scale.has_value()) {
     EP_HOST_ASSERT(expertwise_scale.value().size(0) == num_experts);
@@ -1721,7 +1721,7 @@ Buffer::low_latency_dispatch(
 
   float* packed_recv_x_scales_ptr = nullptr;
 
-  if (use_fp8) {
+  if (use_fp8 && !expertwise_scale.has_value()) {
     EP_HOST_ASSERT((num_ranks * num_max_dispatch_tokens_per_rank) % 4 == 0 &&
                    "TMA requires the number of tokens to be multiple of 4");
     packed_recv_x_scales =
 
@@ -23,14 +23,15 @@
 #include <infiniband/mlx5dv.h>
 #include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
 #include <device_host_transport/nvshmem_common_ibgda.h>
+#ifdef __NVCC__
+#include <cub/cub.cuh>
+#endif
 // clang-format on
-
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/exception.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/ibgda_device.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/launch.cuh"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
-
 namespace deep_ep {
 
 namespace internode_ll {
@@ -189,7 +190,32 @@ __global__ __launch_bounds__(
       // Note(zkk)
       // create a run_deepep_loop, so I need not modify Deepep's code any more.
       int run_deepep_loop = 1;
-      if (use_expertwise_scale) {
+      if (use_expertwise_scale && kUseFP8) {  // w4afp8
+        run_deepep_loop = 0;
+        for (int ii = 0; ii < num_topk; ii++) {
+          int tmp_id = topk_idx[ii + token_idx * num_topk];
+          float scale = expertwise_scale[tmp_id];
+          for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) {
+            auto int4_value = __ldg(x_int4 + i);
+            auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
+            int2 int2_value;
+            phi::AlignedVector<phi::dtype::float8_e4m3fn, 8> res_vec;
+            const float max_bound = 448.f;
+            const float min_bound = -448.f;
+            for (int j = 0; j < 8; j++) {
+              float quant_value =
+                  max_bound * scale * static_cast<float>(bf16_values[j]);
+              quant_value = quant_value > max_bound ? max_bound : quant_value;
+              quant_value = quant_value < min_bound ? min_bound : quant_value;
+              res_vec[j] = static_cast<phi::dtype::float8_e4m3fn>(quant_value);
+            }
+            phi::Store(res_vec,
+                       reinterpret_cast<phi::dtype::float8_e4m3fn*>(rdma_x) +
+                           (ii + token_idx * num_topk) * num_bytes_per_msg +
+                           sizeof(int4) + i * sizeof(res_vec));
+          }
+        }
+      } else if (use_expertwise_scale) {  // w4aint8
         run_deepep_loop = 0;
         for (int ii = 0; ii < num_topk; ii++) {
           int tmp_id = topk_idx[ii + token_idx * num_topk];
@@ -224,7 +250,7 @@ __global__ __launch_bounds__(
         // Read
         auto int4_value = __ldg(x_int4 + i);
 
-        if (kUseFP8) {
+        if (kUseFP8 && !use_expertwise_scale) {
           // Calculate local amax
           auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
           float fp32_values[kNumElemsPerRead];
@@ -502,7 +528,7 @@ LOW_LATENCY_DISPATCH_RECV:
                          st_na_global);
 
       // Copy scales
-      if (kUseFP8) {
+      if (kUseFP8 && !use_expertwise_scale) {
         const auto src_scales = reinterpret_cast<float*>(
             reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
         const auto dst_scales =
 
@@ -32,12 +32,6 @@ struct GpuDevice;
 
 namespace phi {
 
-// #ifndef BLAS_HANDLE_TYPE
-// #define BLAS_HANDLE_TYPE void*
-// // #else
-// // // using cublasHandle_t = struct cublasContext*;
-// #endif
-
 class CustomContext : public DeviceContext,
                       public TypeInfoTraits<DeviceContext, CustomContext> {
  public:
 
@@ -6333,7 +6333,7 @@ void WeightQuantizeInferMeta(const MetaTensor& x,
       common::errors::InvalidArgument(
           "The x tensor of quant op must be 2D, but got[%d]", x_dims.size()));
 
-  if (algo == "w4a8") {
+  if (algo == "w4a8" || algo == "w4afp8") {
     PADDLE_ENFORCE_EQ(
         x_dims[0] % 32,
         0,
@@ -6379,10 +6379,12 @@ void WeightQuantizeInferMeta(const MetaTensor& x,
     dim_out = std::vector<int64_t>({x_dims[1] / 2, x_dims[0]});
   } else if (algo == "w4a8") {
     dim_out = vectorize(x_dims);
+  } else if (algo == "w4afp8") {
+    dim_out = vectorize(x_dims);
   } else {
     PADDLE_THROW(common::errors::InvalidArgument(
         "The algo must be in ['weight_only_int8', 'weight_only_int4', "
-        "'llm.int8', 'w4a8'], but got[%s]",
+        "'llm.int8', 'w4a8', 'w4afp8'], but got[%s]",
         algo));
   }
   out->set_dims(common::make_ddim(dim_out));
 
@@ -147,10 +147,17 @@ void WeightQuantizeKernel(const Context& dev_ctx,
                                      weight_shape,
                                      arch,
                                      algo);
+  } else if (algo == "w4afp8") {
+    weight_permute_gpu_w4afp8<Context>(dev_ctx,
+                                       x.data<int8_t>(),
+                                       out->data<int8_t>(),
+                                       weight_shape,
+                                       arch,
+                                       algo);
   } else {
     PADDLE_FATAL(
         "The algo must be in ['weight_only_int8', 'weight_only_int4', "
-        "'llm.int8', 'w4a8'], but got[%s]",
+        "'llm.int8', 'w4a8', 'w4afp8'], but got[%s]",
         algo);
   }
 }
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ runs:`
`22`	`22`	`uses: PFCCLab/ci-bypass@v1`
`23`	`23`	`with:`
`24`	`24`	`github-token: ${{ inputs.github-token }}`
`25`		`- non-pull-request-event-strategy: 'always-skipped'`
	`25`	`+ non-pull-request-event-strategy: 'never-skipped'`
`26`	`26`	`type: 'composite'`
`27`	`27`	`composite-rule: \|`
`28`	`28`	`{`
Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ jobs:`
`33`	`33`	`uses: PFCCLab/ci-bypass@v1`
`34`	`34`	`with:`
`35`	`35`	`github-token: ${{ secrets.GITHUB_TOKEN }}`
`36`		`- non-pull-request-event-strategy: 'always-skipped'`
	`36`	`+ non-pull-request-event-strategy: 'never-skipped'`
`37`	`37`	`type: 'composite'`
`38`	`38`	`composite-rule: \|`
`39`	`39`	`{`