
Commit 9bd5ebc

Merge branch 'PaddlePaddle:develop' into adapter_aoa
2 parents 5863c6e + 1d9d7a3

File tree: 24 files changed, +724 −545 lines

.github/actions/check-bypass/action.yml

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ runs:
       uses: PFCCLab/ci-bypass@v1
       with:
         github-token: ${{ inputs.github-token }}
-        non-pull-request-event-strategy: 'always-skipped'
+        non-pull-request-event-strategy: 'never-skipped'
         type: 'composite'
         composite-rule: |
           {
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+name: Api-benchmark-baseline
+
+on:
+  workflow_dispatch:
+    inputs:
+      PR_ID:
+        required: false
+        type: string
+      COMMIT_ID:
+        required: false
+        type: string
+      job-name:
+        required: true
+        default: 'api-benchmark'
+        type: choice
+        options:
+          - api-benchmark
+          - others
+  schedule:
+    - cron: '0 21 * * *'
+    - cron: '0 22 * * 3'
+
+permissions: read-all
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  clone:
+    name: Api benchmark clone
+    uses: ./.github/workflows/_Clone-linux.yml
+    with:
+      clone_dir: Paddle-build
+      is_pr: 'false'
+
+  build-docker:
+    name: Api benchmark build docker
+    needs: clone
+    uses: ./.github/workflows/docker.yml
+    with:
+      clone_dir: Paddle-build
+      task: build
+
+  build:
+    name: Api benchmark build
+    if: github.event_name == 'schedule' && github.event.schedule == '0 21 * * *'
+    needs: [clone, build-docker]
+    uses: ./.github/workflows/_Linux-build.yml
+    with:
+      docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
+      is_pr: 'false'
+
+  api-benchmark-baseline-schedule:
+    name: Api benchmark baseline
+    strategy:
+      matrix:
+        run-labels: [api-bm-20, api-bm-27]
+    uses: ./.github/workflows/_Api-Benchmark.yml
+    needs: [clone, build-docker, build]
+    with:
+      docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
+      baseline: 'true'
+      run-labels: ${{ matrix.run-labels }}
+
+  api-benchmark-baseline-pr:
+    name: Api benchmark baseline
+    if: github.event_name == 'workflow_dispatch' && github.event.inputs.job-name == 'api-benchmark'
+    strategy:
+      matrix:
+        run-labels: [api-bm-20, api-bm-27]
+    uses: ./.github/workflows/_Api-Benchmark.yml
+    needs: [clone, build-docker]
+    with:
+      docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }}
+      baseline: 'true'
+      MANUALLY_PR_ID: ${{ inputs.PR_ID }}
+      MANUALLY_COMMIT_ID: ${{ inputs.COMMIT_ID }}
+      run-labels: ${{ matrix.run-labels }}
+
+  test1:
+    runs-on: ubuntu-latest
+    if: github.event.schedule == '0 0 * * *'
+    steps:
+      - name: Test
+        run: |
+          echo "test1"
+
+  test2:
+    runs-on: ubuntu-latest
+    if: github.event.schedule == '0 21 * * *'
+    steps:
+      - name: Test
+        run: |
+          echo "test2"
+
+  test3:
+    runs-on: ubuntu-latest
+    if: github.event.schedule == '0 22 * * 3'
+    steps:
+      - name: Test
+        run: |
+          echo "test3"
+
+  test4:
+    runs-on: ubuntu-latest
+    if: github.event.schedule == '0 21 * * 1'
+    steps:
+      - name: Test
+        run: |
+          echo "test4"

.github/workflows/_Api-Benchmark.yml

Lines changed: 28 additions & 3 deletions
@@ -9,10 +9,24 @@ on:
       can-skip:
         type: string
         required: false
+      baseline:
+        type: string
+        required: false
+        default: "false"
+      MANUALLY_PR_ID:
+        type: string
+        required: false
+      MANUALLY_COMMIT_ID:
+        type: string
+        required: false
+      run-labels:
+        type: string
+        required: false
+        default: "api-bm"

 env:
-  PR_ID: ${{ github.event.pull_request.number }}
-  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
+  PR_ID: ${{ github.event.pull_request.number || '0' }}
+  COMMIT_ID: ${{ github.event.pull_request.head.sha || github.sha }}
   work_dir: /paddle
   PADDLE_ROOT: /paddle
   TASK: paddle-CI-${{ github.event.pull_request.number }}-api-benchmark

@@ -41,6 +55,7 @@ jobs:
     if: ${{ needs.check-bypass.outputs.can-skip != 'true' }}
     runs-on:
       group: Api-bm
+      labels: [self-hosted, "${{ inputs.run-labels }}"]
     steps:
       - name: Determine the runner
         run: |

@@ -118,7 +133,17 @@ jobs:
           cd ./PaddleTest/framework/e2e/api_benchmark_new
           cp /paddle/PTSTools/Uploader/apibm_config.yml .
           source ${{ github.workspace }}/../../../proxy
-          ${python} -m pip install https://paddle-github-action.bj.bcebos.com/PR/build/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+          if [[ "${{ inputs.baseline }}" == "true" ]];then
+            if [[ "${{ inputs.MANUALLY_PR_ID }}" == "" ]]; then
+              export pr_wheel_link=https://paddle-github-action.bj.bcebos.com/PR/build/${{ github.event.pull_request.number }}/${{ github.event.pull_request.head.sha }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+            else
+              export pr_wheel_link=https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+            fi
+            ${python} -m pip install $pr_wheel_link
+            ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --baseline_whl_link $pr_wheel_link
+            exit 0
+          fi
+          ${python} -m pip install $wheel_link
           if [ ${core_index} -eq -1 ];then
             ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --core_index 2
           else

.github/workflows/check-bypass.yml

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ jobs:
       uses: PFCCLab/ci-bypass@v1
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
-        non-pull-request-event-strategy: 'always-skipped'
+        non-pull-request-event-strategy: 'never-skipped'
         type: 'composite'
         composite-rule: |
           {

cmake/external/gtest.cmake

Lines changed: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ else()
       -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
       -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
       -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+      -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
       -DCMAKE_POSITION_INDEPENDENT_CODE=ON
       -DBUILD_GMOCK=ON
       -Dgtest_disable_pthreads=ON

paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp

Lines changed: 4 additions & 4 deletions
@@ -1685,11 +1685,11 @@ Buffer::low_latency_dispatch(
   EP_HOST_ASSERT(!(async && return_recv_hook));
   if (!return_recv_hook) stream_wait(launch_stream, compute_stream);

-  EP_HOST_ASSERT(
-      !(expertwise_scale.has_value() && use_fp8) &&
-      "expertwise_scale and use_fp8 can not arise at the same time.");
   auto return_x_dtype = phi::DataType::BFLOAT16;
   if (use_fp8) {
+    if (expertwise_scale.has_value()) {
+      EP_HOST_ASSERT(expertwise_scale.value().size(0) == num_experts);
+    }
     return_x_dtype = phi::DataType::FLOAT8_E4M3FN;
   } else if (expertwise_scale.has_value()) {
     EP_HOST_ASSERT(expertwise_scale.value().size(0) == num_experts);

@@ -1721,7 +1721,7 @@ Buffer::low_latency_dispatch(

   float* packed_recv_x_scales_ptr = nullptr;

-  if (use_fp8) {
+  if (use_fp8 && !expertwise_scale.has_value()) {
     EP_HOST_ASSERT((num_ranks * num_max_dispatch_tokens_per_rank) % 4 == 0 &&
                    "TMA requires the number of tokens to be multiple of 4");
     packed_recv_x_scales =
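
For orientation, the combined effect of these two hunks on Buffer::low_latency_dispatch is small but easy to misread in diff form. A minimal host-side sketch of the resulting decision logic follows; DispatchPlan and plan_low_latency_dispatch are hypothetical names for illustration, not Paddle API:

// Hedged sketch of the path selection after this change (names invented here).
struct DispatchPlan {
  bool fp8_output;        // return_x_dtype == phi::DataType::FLOAT8_E4M3FN
  bool per_token_scales;  // whether packed_recv_x_scales gets allocated
};

inline DispatchPlan plan_low_latency_dispatch(bool use_fp8,
                                              bool has_expertwise_scale) {
  // Previously, use_fp8 && has_expertwise_scale was rejected outright by an
  // EP_HOST_ASSERT; it is now a valid combination (the w4afp8 path) that
  // returns FP8 data but skips the per-token scale buffer, which stays tied
  // to the amax-based FP8 path.
  return DispatchPlan{use_fp8, use_fp8 && !has_expertwise_scale};
}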

paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu

Lines changed: 31 additions & 5 deletions
@@ -23,14 +23,15 @@
 #include <infiniband/mlx5dv.h>
 #include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
 #include <device_host_transport/nvshmem_common_ibgda.h>
+#ifdef __NVCC__
+#include <cub/cub.cuh>
+#endif
 // clang-format on
-
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/configs.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/exception.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/ibgda_device.cuh"
 #include "paddle/fluid/distributed/collective/deep_ep/kernels/launch.cuh"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
-
 namespace deep_ep {

 namespace internode_ll {

@@ -189,7 +190,32 @@ __global__ __launch_bounds__(
   // Note(zkk)
   // create a run_deepep_loop, so I need not modify Deepep's code any more.
   int run_deepep_loop = 1;
-  if (use_expertwise_scale) {
+  if (use_expertwise_scale && kUseFP8) {  // w4afp8
+    run_deepep_loop = 0;
+    for (int ii = 0; ii < num_topk; ii++) {
+      int tmp_id = topk_idx[ii + token_idx * num_topk];
+      float scale = expertwise_scale[tmp_id];
+      for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) {
+        auto int4_value = __ldg(x_int4 + i);
+        auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
+        int2 int2_value;
+        phi::AlignedVector<phi::dtype::float8_e4m3fn, 8> res_vec;
+        const float max_bound = 448.f;
+        const float min_bound = -448.f;
+        for (int j = 0; j < 8; j++) {
+          float quant_value =
+              max_bound * scale * static_cast<float>(bf16_values[j]);
+          quant_value = quant_value > max_bound ? max_bound : quant_value;
+          quant_value = quant_value < min_bound ? min_bound : quant_value;
+          res_vec[j] = static_cast<phi::dtype::float8_e4m3fn>(quant_value);
+        }
+        phi::Store(res_vec,
+                   reinterpret_cast<phi::dtype::float8_e4m3fn*>(rdma_x) +
+                       (ii + token_idx * num_topk) * num_bytes_per_msg +
+                       sizeof(int4) + i * sizeof(res_vec));
+      }
+    }
+  } else if (use_expertwise_scale) {  // w4aint8
     run_deepep_loop = 0;
     for (int ii = 0; ii < num_topk; ii++) {
       int tmp_id = topk_idx[ii + token_idx * num_topk];

@@ -224,7 +250,7 @@
         // Read
         auto int4_value = __ldg(x_int4 + i);

-        if (kUseFP8) {
+        if (kUseFP8 && !use_expertwise_scale) {
           // Calculate local amax
           auto bf16_values = reinterpret_cast<nv_bfloat16*>(&int4_value);
           float fp32_values[kNumElemsPerRead];

@@ -502,7 +528,7 @@ LOW_LATENCY_DISPATCH_RECV:
             st_na_global);

     // Copy scales
-    if (kUseFP8) {
+    if (kUseFP8 && !use_expertwise_scale) {
       const auto src_scales = reinterpret_cast<float*>(
           reinterpret_cast<uint8_t*>(src_data) + hidden_bytes);
       const auto dst_scales =
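
The arithmetic in the new w4afp8 branch is a straightforward scale-and-clamp into the float8_e4m3fn range. A minimal sketch, assuming plain float stands in for nv_bfloat16 and for the final float8_e4m3fn cast, with quantize_w4afp8_value as a hypothetical name:

#include <algorithm>

// Per-element quantization mirroring the w4afp8 loop above: fold the e4m3
// maximum (448) into the per-expert scale, then clamp before the narrowing
// cast. Unlike the amax-based FP8 path, no per-token scale is produced,
// which is why the later guards read kUseFP8 && !use_expertwise_scale.
inline float quantize_w4afp8_value(float x, float expert_scale) {
  const float max_bound = 448.f;   // largest finite e4m3 magnitude
  const float min_bound = -448.f;
  float q = max_bound * expert_scale * x;
  return std::min(std::max(q, min_bound), max_bound);
}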

paddle/phi/backends/custom/custom_context.h

Lines changed: 0 additions & 6 deletions
@@ -32,12 +32,6 @@ struct GpuDevice;

 namespace phi {

-// #ifndef BLAS_HANDLE_TYPE
-// #define BLAS_HANDLE_TYPE void*
-// // #else
-// // // using cublasHandle_t = struct cublasContext*;
-// #endif
-
 class CustomContext : public DeviceContext,
                       public TypeInfoTraits<DeviceContext, CustomContext> {
  public:

paddle/phi/infermeta/unary.cc

Lines changed: 4 additions & 2 deletions
@@ -6333,7 +6333,7 @@ void WeightQuantizeInferMeta(const MetaTensor& x,
       common::errors::InvalidArgument(
           "The x tensor of quant op must be 2D, but got[%d]", x_dims.size()));

-  if (algo == "w4a8") {
+  if (algo == "w4a8" || algo == "w4afp8") {
     PADDLE_ENFORCE_EQ(
         x_dims[0] % 32,
         0,

@@ -6379,10 +6379,12 @@ void WeightQuantizeInferMeta(const MetaTensor& x,
     dim_out = std::vector<int64_t>({x_dims[1] / 2, x_dims[0]});
   } else if (algo == "w4a8") {
     dim_out = vectorize(x_dims);
+  } else if (algo == "w4afp8") {
+    dim_out = vectorize(x_dims);
   } else {
     PADDLE_THROW(common::errors::InvalidArgument(
         "The algo must be in ['weight_only_int8', 'weight_only_int4', "
-        "'llm.int8', 'w4a8'], but got[%s]",
+        "'llm.int8', 'w4a8', 'w4afp8'], but got[%s]",
         algo));
   }
   out->set_dims(common::make_ddim(dim_out));
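
Read together, the two hunks make 'w4afp8' behave like 'w4a8' for shape inference: both keep the input dims and both require x_dims[0] to be a multiple of 32, while the int4 weight-only path halves and transposes the shape. A hedged sketch covering only the branches visible in this diff (weight_quantize_out_dims is an invented helper, not Paddle API):

#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int64_t> weight_quantize_out_dims(const std::string& algo,
                                              const std::vector<int64_t>& x_dims) {
  if (algo == "weight_only_int4") return {x_dims[1] / 2, x_dims[0]};  // packed, transposed
  if (algo == "w4a8" || algo == "w4afp8") return x_dims;              // shape preserved
  throw std::invalid_argument("unsupported algo for this sketch");
}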

paddle/phi/kernels/gpu/weight_quantize_kernel.cu

Lines changed: 8 additions & 1 deletion
@@ -147,10 +147,17 @@ void WeightQuantizeKernel(const Context& dev_ctx,
                                  weight_shape,
                                  arch,
                                  algo);
+  } else if (algo == "w4afp8") {
+    weight_permute_gpu_w4afp8<Context>(dev_ctx,
+                                       x.data<int8_t>(),
+                                       out->data<int8_t>(),
+                                       weight_shape,
+                                       arch,
+                                       algo);
   } else {
     PADDLE_FATAL(
         "The algo must be in ['weight_only_int8', 'weight_only_int4', "
-        "'llm.int8', 'w4a8'], but got[%s]",
+        "'llm.int8', 'w4a8', 'w4afp8'], but got[%s]",
         algo);
   }
 }
