Commit e0d5ee2

Merge branch 'develop' into add_bmm_out
2 parents 76fe32b + 1d9d7a3 commit e0d5ee2

File tree

24 files changed: +592 −546 lines

.github/actions/check-bypass/action.yml

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ runs:
       uses: PFCCLab/ci-bypass@v1
       with:
         github-token: ${{ inputs.github-token }}
-        non-pull-request-event-strategy: 'always-skipped'
+        non-pull-request-event-strategy: 'never-skipped'
         type: 'composite'
         composite-rule: |
           {

.github/workflows/Api-Benchmark-baseline.yml

Lines changed: 15 additions & 5 deletions
@@ -29,15 +29,13 @@ defaults:
 jobs:
   clone:
     name: Api benchmark clone
-    if: github.event.schedule == '0 21 * * *'
     uses: ./.github/workflows/_Clone-linux.yml
     with:
       clone_dir: Paddle-build
       is_pr: 'false'

   build-docker:
     name: Api benchmark build docker
-    if: github.event.schedule == '0 21 * * *'
     needs: clone
     uses: ./.github/workflows/docker.yml
     with:
@@ -46,21 +44,33 @@ jobs:

   build:
     name: Api benchmark build
-    if: github.event.schedule == '0 21 * * *'
+    if: github.event_name == 'schedule' && github.event.schedule == '0 21 * * *'
     needs: [clone, build-docker]
     uses: ./.github/workflows/_Linux-build.yml
     with:
       docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
       is_pr: 'false'

-  api-benchmark-baseline:
+  api-benchmark-baseline-schedule:
     name: Api benchmark baseline
-    if: github.event.schedule == '0 21 * * *' || github.event.inputs.job-name == 'api-benchmark'
     strategy:
       matrix:
         run-labels: [api-bm-20, api-bm-27]
     uses: ./.github/workflows/_Api-Benchmark.yml
     needs: [clone, build-docker, build]
+    with:
+      docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
+      baseline: 'true'
+      run-labels: ${{ matrix.run-labels }}
+
+  api-benchmark-baseline-pr:
+    name: Api benchmark baseline
+    if: github.event_name == 'workflow_dispatch' && github.event.inputs.job-name == 'api-benchmark'
+    strategy:
+      matrix:
+        run-labels: [api-bm-20, api-bm-27]
+    uses: ./.github/workflows/_Api-Benchmark.yml
+    needs: [clone, build-docker]
     with:
       docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
       baseline: 'true'

.github/workflows/_Api-Benchmark.yml

Lines changed: 5 additions & 3 deletions
@@ -133,15 +133,17 @@ jobs:
           cd ./PaddleTest/framework/e2e/api_benchmark_new
           cp /paddle/PTSTools/Uploader/apibm_config.yml .
           source ${{ github.workspace }}/../../../proxy
-          ${python} -m pip install $wheel_link
           if [[ "${{ inputs.baseline }}" == "true" ]];then
            if [[ "${{ inputs.MANUALLY_PR_ID }}" == "" ]]; then
-            ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link https://paddle-github-action.bj.bcebos.com/PR/build/${{ github.event.pull_request.number }}/${{ github.event.pull_request.head.sha }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+            export pr_wheel_link=https://paddle-github-action.bj.bcebos.com/PR/build/${{ github.event.pull_request.number }}/${{ github.event.pull_request.head.sha }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
            else
-            ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+            export pr_wheel_link=https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
            fi
+           ${python} -m pip install $pr_wheel_link
+           ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link $pr_wheel_link
            exit 0
           fi
+          ${python} -m pip install $wheel_link
           if [ ${core_index} -eq -1 ];then
            ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --core_index 2
           else

.github/workflows/check-bypass.yml

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ jobs:
         uses: PFCCLab/ci-bypass@v1
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          non-pull-request-event-strategy: 'always-skipped'
+          non-pull-request-event-strategy: 'never-skipped'
           type: 'composite'
           composite-rule: |
             {

cmake/external/gtest.cmake

Lines changed: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ else()
       -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
       -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
       -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+      -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
       -DCMAKE_POSITION_INDEPENDENT_CODE=ON
       -DBUILD_GMOCK=ON
       -Dgtest_disable_pthreads=ON

paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp

Lines changed: 4 additions & 4 deletions
@@ -1685,11 +1685,11 @@ Buffer::low_latency_dispatch(
   EP_HOST_ASSERT(!(async && return_recv_hook));
   if (!return_recv_hook) stream_wait(launch_stream, compute_stream);

-  EP_HOST_ASSERT(
-      !(expertwise_scale.has_value() && use_fp8) &&
-      "expertwise_scale and use_fp8 can not arise at the same time.");
   auto return_x_dtype = phi::DataType::BFLOAT16;
   if (use_fp8) {
+    if (expertwise_scale.has_value()) {
+      EP_HOST_ASSERT(expertwise_scale.value().size(0) == num_experts);
+    }
     return_x_dtype = phi::DataType::FLOAT8_E4M3FN;
   } else if (expertwise_scale.has_value()) {
     EP_HOST_ASSERT(expertwise_scale.value().size(0) == num_experts);
@@ -1721,7 +1721,7 @@ Buffer::low_latency_dispatch(

   float* packed_recv_x_scales_ptr = nullptr;

-  if (use_fp8) {
+  if (use_fp8 && !expertwise_scale.has_value()) {
     EP_HOST_ASSERT((num_ranks * num_max_dispatch_tokens_per_rank) % 4 == 0 &&
                    "TMA requires the number of tokens to be multiple of 4");
     packed_recv_x_scales =
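The net effect of this change is that expertwise_scale and use_fp8 may now be combined: the hard assertion is removed, the return dtype is still FLOAT8_E4M3FN, and the per-token scales buffer is only allocated when no expertwise scale is supplied. A condensed, standalone sketch of that decision (plain C++ with placeholder types, not the actual deep_ep buffers):

#include <cstdio>
#include <optional>

int main() {
  bool use_fp8 = true;
  std::optional<int> expertwise_scale = 7;  // stands in for the optional scale tensor

  const char* return_x_dtype = "BFLOAT16";
  bool allocate_per_token_scales = false;

  if (use_fp8) {
    return_x_dtype = "FLOAT8_E4M3FN";
    // Per-token scales are only produced when no expertwise scale is given;
    // with expertwise_scale the scaling happens per expert during dispatch.
    allocate_per_token_scales = !expertwise_scale.has_value();
  }

  std::printf("dtype=%s, per-token scales=%d\n", return_x_dtype,
              static_cast<int>(allocate_per_token_scales));
  return 0;
}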

paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu

Lines changed: 31 additions & 5 deletions
@@ -23,14 +23,15 @@
 #include <infiniband/mlx5dv.h>
 #include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
 #include <device_host_transport/nvshmem_common_ibgda.h>
+#ifdef __NVCC__
+#include <cub/cub.cuh>
+#endif
 // clang-format on
-
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/exception.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/ibgda_device.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/launch.cuh"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
-
 namespace deep_ep {

 namespace internode_ll {
@@ -189,7 +190,32 @@ __global__ __launch_bounds__(
   // Note(zkk)
   // create a run_deepep_loop, so I need not modify Deepep's code any more.
   int run_deepep_loop = 1;
-  if (use_expertwise_scale) {
+  if (use_expertwise_scale && kUseFP8) {  // w4afp8
+    run_deepep_loop = 0;
+    for (int ii = 0; ii < num_topk; ii++) {
+      int tmp_id = topk_idx[ii + token_idx * num_topk];
+      float scale = expertwise_scale[tmp_id];
+      for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) {
+        auto int4_value = __ldg(x_int4 + i);
+        auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
+        int2 int2_value;
+        phi::AlignedVector<phi::dtype::float8_e4m3fn, 8> res_vec;
+        const float max_bound = 448.f;
+        const float min_bound = -448.f;
+        for (int j = 0; j < 8; j++) {
+          float quant_value =
+              max_bound * scale * static_cast<float>(bf16_values[j]);
+          quant_value = quant_value > max_bound ? max_bound : quant_value;
+          quant_value = quant_value < min_bound ? min_bound : quant_value;
+          res_vec[j] = static_cast<phi::dtype::float8_e4m3fn>(quant_value);
+        }
+        phi::Store(res_vec,
+                   reinterpret_cast<phi::dtype::float8_e4m3fn*>(rdma_x) +
+                       (ii + token_idx * num_topk) * num_bytes_per_msg +
+                       sizeof(int4) + i * sizeof(res_vec));
+      }
+    }
+  } else if (use_expertwise_scale) {  // w4aint8
     run_deepep_loop = 0;
     for (int ii = 0; ii < num_topk; ii++) {
       int tmp_id = topk_idx[ii + token_idx * num_topk];
@@ -224,7 +250,7 @@ __global__ __launch_bounds__(
         // Read
         auto int4_value = __ldg(x_int4 + i);

-        if (kUseFP8) {
+        if (kUseFP8 && !use_expertwise_scale) {
           // Calculate local amax
           auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
           float fp32_values[kNumElemsPerRead];
@@ -502,7 +528,7 @@ LOW_LATENCY_DISPATCH_RECV:
                        st_na_global);

     // Copy scales
-    if (kUseFP8) {
+    if (kUseFP8 && !use_expertwise_scale) {
       const auto src_scales = reinterpret_cast<float*>(
           reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
       const auto dst_scales =
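For reference, the new w4afp8 branch quantizes each bf16 element to FP8 E4M3 using a per-expert scale and a clamp to the representable range [-448, 448]. A minimal host-side sketch of that per-element math (ordinary C++, not the CUDA kernel; the example scale and inputs are made up):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const float max_bound = 448.f;     // largest finite FP8 E4M3 magnitude
  const float min_bound = -448.f;
  const float expert_scale = 0.03f;  // stands in for expertwise_scale[tmp_id]
  std::vector<float> values = {0.5f, -1.2f, 40.0f, -0.001f};  // stands in for the bf16 inputs

  for (float v : values) {
    float quant_value = max_bound * expert_scale * v;
    quant_value = std::min(std::max(quant_value, min_bound), max_bound);
    // In the kernel this value is then cast to phi::dtype::float8_e4m3fn and
    // written into the per-token RDMA message buffer.
    std::printf("%9.4f -> %9.4f\n", v, quant_value);
  }
  return 0;
}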

paddle/phi/backends/custom/custom_context.h

Lines changed: 0 additions & 6 deletions
@@ -32,12 +32,6 @@ struct GpuDevice;

 namespace phi {

-// #ifndef BLAS_HANDLE_TYPE
-// #define BLAS_HANDLE_TYPE void*
-// // #else
-// // // using cublasHandle_t = struct cublasContext*;
-// #endif
-
 class CustomContext : public DeviceContext,
                       public TypeInfoTraits<DeviceContext, CustomContext> {
  public:

paddle/phi/infermeta/unary.cc

Lines changed: 4 additions & 2 deletions
@@ -6333,7 +6333,7 @@ void WeightQuantizeInferMeta(const MetaTensor& x,
       common::errors::InvalidArgument(
           "The x tensor of quant op must be 2D, but got[%d]", x_dims.size()));

-  if (algo == "w4a8") {
+  if (algo == "w4a8" || algo == "w4afp8") {
     PADDLE_ENFORCE_EQ(
         x_dims[0] % 32,
         0,
@@ -6379,10 +6379,12 @@ void WeightQuantizeInferMeta(const MetaTensor& x,
     dim_out = std::vector<int64_t>({x_dims[1] / 2, x_dims[0]});
   } else if (algo == "w4a8") {
     dim_out = vectorize(x_dims);
+  } else if (algo == "w4afp8") {
+    dim_out = vectorize(x_dims);
   } else {
     PADDLE_THROW(common::errors::InvalidArgument(
         "The algo must be in ['weight_only_int8', 'weight_only_int4', "
-        "'llm.int8', 'w4a8'], but got[%s]",
+        "'llm.int8', 'w4a8', 'w4afp8'], but got[%s]",
         algo));
   }
   out->set_dims(common::make_ddim(dim_out));
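As with w4a8, the w4afp8 branch leaves the quantized weight shape unchanged, whereas weight_only_int4 packs two values per byte and swaps the dims. A tiny worked example with an assumed 2-D weight shape (not taken from the diff):

#include <cstdio>
#include <vector>

int main() {
  std::vector<long long> x_dims = {4096, 12288};  // hypothetical 2-D weight

  // weight_only_int4 packs two 4-bit values per byte and transposes:
  std::vector<long long> int4_out = {x_dims[1] / 2, x_dims[0]};
  // w4a8 and w4afp8 keep the original shape:
  std::vector<long long> w4afp8_out = x_dims;

  std::printf("weight_only_int4: [%lld, %lld]\n", int4_out[0], int4_out[1]);
  std::printf("w4afp8:           [%lld, %lld]\n", w4afp8_out[0], w4afp8_out[1]);
  return 0;
}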

paddle/phi/kernels/gpu/weight_quantize_kernel.cu

Lines changed: 8 additions & 1 deletion
@@ -147,10 +147,17 @@ void WeightQuantizeKernel(const Context& dev_ctx,
                                        weight_shape,
                                        arch,
                                        algo);
+  } else if (algo == "w4afp8") {
+    weight_permute_gpu_w4afp8<Context>(dev_ctx,
+                                       x.data<int8_t>(),
+                                       out->data<int8_t>(),
+                                       weight_shape,
+                                       arch,
+                                       algo);
   } else {
     PADDLE_FATAL(
         "The algo must be in ['weight_only_int8', 'weight_only_int4', "
-        "'llm.int8', 'w4a8'], but got[%s]",
+        "'llm.int8', 'w4a8', 'w4afp8'], but got[%s]",
         algo);
   }
 }
