From be933c943de3fc5757c00189221ed9cb8f2bc473 Mon Sep 17 00:00:00 2001 From: lijin23 <41257772+lj970926@users.noreply.github.com> Date: Sat, 28 Sep 2024 12:09:17 +0800 Subject: [PATCH 001/135] [XPU] support fused rope with none sin cos and refine code (#68501) --- .../fusion/xpu/fused_rope_grad_kernel.cc | 204 ++---- .../kernels/fusion/xpu/fused_rope_kernel.cc | 215 ++----- .../phi/kernels/fusion/xpu/fused_rope_utils.h | 585 ++++++++++++++---- ..._fused_rotary_position_embedding_op_xpu.py | 184 +++--- 4 files changed, 645 insertions(+), 543 deletions(-) diff --git a/paddle/phi/kernels/fusion/xpu/fused_rope_grad_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_rope_grad_kernel.cc index fe4de79c2b373..9a9ca69244fd4 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_rope_grad_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_rope_grad_kernel.cc @@ -18,6 +18,22 @@ namespace phi { namespace fusion { +#define LAUNCH_XPU_FUSED_ROPE_GRAD(T, SCT) \ + XPUFusedRopeImpl(dev_ctx, \ + dout_q, \ + dout_k, \ + dout_v, \ + sin, \ + cos, \ + position_ids, \ + use_neox_rotary_style, \ + time_major, \ + true, \ + rotary_emb_base, \ + dq, \ + dk, \ + dv); + template void FusedRopeGradKernel(const Context& dev_ctx, const paddle::optional& sin, @@ -32,175 +48,33 @@ void FusedRopeGradKernel(const Context& dev_ctx, DenseTensor* dq, DenseTensor* dk, DenseTensor* dv) { - using XPUType = typename XPUTypeTrait::Type; - if (dout_q.numel() <= 0) { - return; + dev_ctx.template Alloc(dq); + if (dout_k) { + dev_ctx.template Alloc(dk); } - int64_t batch_size = dout_q.dims()[0]; - int64_t seq_len = dout_q.dims()[1]; - int64_t num_heads = dout_q.dims()[2]; - int64_t head_dim = dout_q.dims()[3]; - PADDLE_ENFORCE_EQ(head_dim % 2, - 0, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - - int64_t sin_cos_len = batch_size * seq_len * head_dim; - if (use_neox_rotary_style) { - auto* sin_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - auto* cos_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - } - XPUGetSinCosData( - dev_ctx, sin, position_ids, sin_data, batch_size, seq_len, head_dim); - XPUGetSinCosData( - dev_ctx, cos, position_ids, cos_data, batch_size, seq_len, head_dim); - if (!dout_k.get_ptr()) { - auto* dq_data = reinterpret_cast(dev_ctx.template Alloc(dq)); - int ret = xpu::rotary_embedding_v3_single_grad( - dev_ctx.x_context(), - reinterpret_cast(dout_q.data()), - cos_data, - sin_data, - dq_data, - batch_size, - seq_len, - num_heads, - head_dim, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_embedding_v3_single_grad"); + if (dout_v) { + dev_ctx.template Alloc(dv); + } + if (sin && cos) { + PADDLE_ENFORCE_EQ(sin->dims(), + cos->dims(), + common::errors::InvalidArgument( + "The dims of sin and cos must be the same. 
But " + "received sin's dims is {%s}, cos's dims is {%s}.", + sin->dims(), + cos->dims())); + if (sin->dtype() == phi::DataType::FLOAT32) { + LAUNCH_XPU_FUSED_ROPE_GRAD(T, float); } else { - int64_t num_heads_k = dout_k->dims()[2]; - auto* dq_data = reinterpret_cast(dev_ctx.template Alloc(dq)); - auto* dk_data = reinterpret_cast(dev_ctx.template Alloc(dk)); - int ret = xpu::rotary_embedding_v3_grad( - dev_ctx.x_context(), - reinterpret_cast(dout_q.data()), - reinterpret_cast(dout_k->data()), - cos_data, - sin_data, - dq_data, - dk_data, - batch_size, - seq_len, - num_heads, - head_dim, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, - {seq_len * num_heads_k * head_dim, - num_heads_k * head_dim, - head_dim, - 1}, - num_heads_k); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_embedding_v3_grad"); - } - if (dout_v.get_ptr()) { - int64_t num_heads_v = dout_v->dims()[2]; - auto* dv_data = reinterpret_cast(dev_ctx.template Alloc(dv)); - int ret = xpu::rotary_embedding_v3_single_grad( - dev_ctx.x_context(), - reinterpret_cast(dout_v->data()), - cos_data, - sin_data, - dv_data, - batch_size, - seq_len, - num_heads_v, - head_dim, - {seq_len * num_heads_v * head_dim, - num_heads_v * head_dim, - head_dim, - 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_embedding_v3_single_grad"); + PADDLE_ENFORCE_EQ( + phi::CppTypeToDataType::Type(), + sin->dtype(), + common::errors::InvalidArgument( + "The embedding dtype and sin/cos dtype mismatched.")); + LAUNCH_XPU_FUSED_ROPE_GRAD(T, T); } } else { - auto* sin_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - auto* cos_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. 
But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - } - XPUGetSinCosData( - dev_ctx, sin, position_ids, sin_data, batch_size, seq_len, head_dim); - XPUGetSinCosData( - dev_ctx, cos, position_ids, cos_data, batch_size, seq_len, head_dim); - if (head_dim * sizeof(T) <= 1024 && head_dim % 64 == 0 && dout_k) { - int64_t num_heads_k = dout_k->dims()[2]; - auto* dq_data = reinterpret_cast(dev_ctx.template Alloc(dq)); - auto* dk_data = reinterpret_cast(dev_ctx.template Alloc(dk)); - int ret = xpu::rotary_no_freqs_qk_embedding_v2_grad( - dev_ctx.x_context(), - reinterpret_cast(dout_q.data()), - reinterpret_cast(dout_k->data()), - sin_data, - cos_data, - dq_data, - dk_data, - {batch_size, seq_len, num_heads, head_dim}, - {batch_size, seq_len, 1, head_dim}, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, - {seq_len * head_dim, head_dim, head_dim, 1}, - num_heads_k); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_no_freqs_qk_embedding_v2_grad"); - } else { - auto* dq_data = reinterpret_cast(dev_ctx.template Alloc(dq)); - XPUFusedRotaryHalf( - dev_ctx, - reinterpret_cast(dout_q.data()), - sin_data, - cos_data, - dq_data, - batch_size, - seq_len, - num_heads, - head_dim, - true); - - if (dout_k.get_ptr()) { - int64_t num_heads_k = dout_k->dims()[2]; - auto* dk_data = - reinterpret_cast(dev_ctx.template Alloc(dk)); - XPUFusedRotaryHalf( - dev_ctx, - reinterpret_cast(dout_k->data()), - sin_data, - cos_data, - dk_data, - batch_size, - seq_len, - num_heads_k, - head_dim, - true); - } - } - - if (dout_v.get_ptr()) { - int64_t num_heads_v = dout_v->dims()[2]; - auto* dv_data = reinterpret_cast(dev_ctx.template Alloc(dv)); - XPUFusedRotaryHalf( - dev_ctx, - reinterpret_cast(dout_v->data()), - sin_data, - cos_data, - dv_data, - batch_size, - seq_len, - num_heads_v, - head_dim, - true); - } + LAUNCH_XPU_FUSED_ROPE_GRAD(T, float); } } } // namespace fusion diff --git a/paddle/phi/kernels/fusion/xpu/fused_rope_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_rope_kernel.cc index c44fc097db4bf..3c1044fca5443 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_rope_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_rope_kernel.cc @@ -18,7 +18,21 @@ namespace phi { namespace fusion { - +#define LAUNCH_XPU_FUSED_ROPE(T, SCT) \ + XPUFusedRopeImpl(dev_ctx, \ + q, \ + k, \ + v, \ + sin, \ + cos, \ + position_ids, \ + use_neox_rotary_style, \ + time_major, \ + false, \ + rotary_emb_base, \ + out_q, \ + out_k, \ + out_v); template void FusedRopeKernel(const Context& dev_ctx, const DenseTensor& q, @@ -33,185 +47,36 @@ void FusedRopeKernel(const Context& dev_ctx, DenseTensor* out_q, DenseTensor* out_k, DenseTensor* out_v) { - using XPUType = typename XPUTypeTrait::Type; - if (q.numel() <= 0) { - return; + dev_ctx.template Alloc(out_q); + if (k) { + dev_ctx.template Alloc(out_k); + } + if (v) { + dev_ctx.template Alloc(out_v); } - PADDLE_ENFORCE_EQ( - time_major, - false, - common::errors::InvalidArgument("time_major is not supported in xpu")); - int64_t batch_size = q.dims()[0]; - int64_t seq_len = q.dims()[1]; - int64_t num_heads = q.dims()[2]; - int64_t head_dim = q.dims()[3]; - PADDLE_ENFORCE_EQ(head_dim % 2, - 0, - common::errors::InvalidArgument( - "The head_dim of input must be a multiple of 2.")); - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int64_t sin_cos_len = batch_size * seq_len * head_dim; - if (use_neox_rotary_style) { - auto* sin_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - auto* cos_data = 
RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - } - XPUGetSinCosData( - dev_ctx, sin, position_ids, sin_data, batch_size, seq_len, head_dim); - XPUGetSinCosData( - dev_ctx, cos, position_ids, cos_data, batch_size, seq_len, head_dim); - if (!k) { - auto* outq_data = - reinterpret_cast(dev_ctx.template Alloc(out_q)); - int ret = xpu::rotary_embedding_v3_single( - dev_ctx.x_context(), - reinterpret_cast(q.data()), - cos_data, - sin_data, - outq_data, - batch_size, - seq_len, - num_heads, - head_dim, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_embedding_v3_single"); + if (sin && cos) { + PADDLE_ENFORCE_EQ(sin->dims(), + cos->dims(), + common::errors::InvalidArgument( + "The dims of sin and cos must be the same. But " + "received sin's dims is {%s}, cos's dims is {%s}.", + sin->dims(), + cos->dims())); + // For user provided sin/cos, we use the dtype as is. + if (sin->dtype() == phi::DataType::FLOAT32) { + LAUNCH_XPU_FUSED_ROPE(T, float); } else { - int64_t num_heads_k = k->dims()[2]; - auto* outq_data = - reinterpret_cast(dev_ctx.template Alloc(out_q)); - auto* outk_data = - reinterpret_cast(dev_ctx.template Alloc(out_k)); - int ret = xpu::rotary_embedding_v3( - dev_ctx.x_context(), - reinterpret_cast(q.data()), - reinterpret_cast(k->data()), - cos_data, - sin_data, - outq_data, - outk_data, - batch_size, - seq_len, - num_heads, - head_dim, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, - {seq_len * num_heads_k * head_dim, - num_heads_k * head_dim, - head_dim, - 1}, - num_heads_k); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_embedding_v3"); - } - - if (v) { - int64_t num_heads_v = v->dims()[2]; - auto* outv_data = - reinterpret_cast(dev_ctx.template Alloc(out_v)); - int ret = xpu::rotary_embedding_v3_single( - dev_ctx.x_context(), - reinterpret_cast(v->data()), - cos_data, - sin_data, - outv_data, - batch_size, - seq_len, - num_heads_v, - head_dim, - {seq_len * num_heads_v * head_dim, - num_heads_v * head_dim, - head_dim, - 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_embedding_v3_single"); + PADDLE_ENFORCE_EQ( + phi::CppTypeToDataType::Type(), + sin->dtype(), + common::errors::InvalidArgument( + "The embedding dtype and sin/cos dtype mismatched.")); + LAUNCH_XPU_FUSED_ROPE(T, T); } } else { - auto* sin_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - auto* cos_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); - if (sin.get_ptr() && cos.get_ptr()) { - PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), - cos.get_ptr()->dims(), - common::errors::InvalidArgument( - "The dims of sin and cos must be the same. 
But " - "received sin's dims is {%s}, cos's dims is {%s}.", - sin.get_ptr()->dims(), - cos.get_ptr()->dims())); - } - XPUGetSinCosData( - dev_ctx, sin, position_ids, sin_data, batch_size, seq_len, head_dim); - XPUGetSinCosData( - dev_ctx, cos, position_ids, cos_data, batch_size, seq_len, head_dim); - if (head_dim * sizeof(T) <= 1024 && head_dim % 64 == 0 && k) { - int64_t num_heads_k = k->dims()[2]; - auto* outq_data = - reinterpret_cast(dev_ctx.template Alloc(out_q)); - auto* outk_data = - reinterpret_cast(dev_ctx.template Alloc(out_k)); - int ret = xpu::rotary_no_freqs_qk_embedding_v2( - dev_ctx.x_context(), - reinterpret_cast(q.data()), - reinterpret_cast(k->data()), - sin_data, - cos_data, - outq_data, - outk_data, - {batch_size, seq_len, num_heads, head_dim}, - {batch_size, seq_len, 1, head_dim}, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, - {seq_len * head_dim, head_dim, head_dim, 1}, - num_heads_k); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "rotary_no_freqs_qk_embedding_v2"); - } else { - auto* outq_data = - reinterpret_cast(dev_ctx.template Alloc(out_q)); - XPUFusedRotaryHalf( - dev_ctx, - reinterpret_cast(q.data()), - sin_data, - cos_data, - outq_data, - batch_size, - seq_len, - num_heads, - head_dim); - - if (k) { - int64_t num_heads_k = k->dims()[2]; - auto* outk_data = - reinterpret_cast(dev_ctx.template Alloc(out_k)); - XPUFusedRotaryHalf( - dev_ctx, - reinterpret_cast(k->data()), - sin_data, - cos_data, - outk_data, - batch_size, - seq_len, - num_heads_k, - head_dim); - } - } - - if (v) { - int64_t num_heads_v = v->dims()[2]; - auto* outv_data = - reinterpret_cast(dev_ctx.template Alloc(out_v)); - XPUFusedRotaryHalf( - dev_ctx, - reinterpret_cast(v->data()), - sin_data, - cos_data, - outv_data, - batch_size, - seq_len, - num_heads_v, - head_dim); - } + // For generated sin/cos, we use fp32 all. + LAUNCH_XPU_FUSED_ROPE(T, float); } } } // namespace fusion diff --git a/paddle/phi/kernels/fusion/xpu/fused_rope_utils.h b/paddle/phi/kernels/fusion/xpu/fused_rope_utils.h index f45921cb1d0c1..6a9217c9e952a 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_rope_utils.h +++ b/paddle/phi/kernels/fusion/xpu/fused_rope_utils.h @@ -14,130 +14,505 @@ #pragma once #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" namespace phi { namespace fusion { -template +template +void GetSinCosByPassValue(const Context& dev_ctx, + const paddle::optional& sin, + const paddle::optional& cos, + const paddle::optional& position_ids, + XPUSCType* sin_data, + XPUSCType* cos_data, + int64_t batch_size, + int64_t seq_len, + int64_t head_dim) { + PADDLE_ENFORCE_EQ((std::is_same::value), + true, + common::errors::Unimplemented( + "The xpu get_sin_cos_by_pass_value only supports " + "sin/cos with the same type as inputs now.")); + auto sin_cos_dims = sin->dims(); + int64_t dims_size = sin_cos_dims.size(); + int ret = xpu::SUCCESS; + PADDLE_ENFORCE_EQ( + (dims_size == 2 || dims_size == 4), + true, + common::errors::InvalidArgument("The dims of sin and cos is expected to " + "be 2 or 4, but received %d.", + dims_size)); + if (dims_size == 4) { + // sin.shape: [1, seq_len, 1, head_dim] + PADDLE_ENFORCE_EQ((sin_cos_dims[2] == 1), + true, + common::errors::InvalidArgument( + "The num_heads of sin and cos must be 1.")); + } + int sin_seq_len_dim = (dims_size) == 4 ? 
1 : 0; + if (position_ids) { + PADDLE_ENFORCE_EQ( + (sin_cos_dims[dims_size - 1] == head_dim && + sin_cos_dims[sin_seq_len_dim] >= seq_len), + true, + common::errors::InvalidArgument( + "The seq_len of sin and cos must be greater than or equal to " + "this of q. The head_dim of sin and cos must be the same as this " + "of q.")); + + auto position_ids_dims = position_ids->dims(); + PADDLE_ENFORCE_EQ(position_ids_dims.size(), + 2, + common::errors::InvalidArgument( + "The dims of position_ids is expected to " + "be 2, but received %d.", + position_ids_dims.size())); + + PADDLE_ENFORCE_EQ( + (position_ids_dims[0] == batch_size && position_ids_dims[1] == seq_len), + true, + common::errors::InvalidArgument( + "The batch_size and seq_len of position_ids must be the same as " + "those of q.")); + + ret = xpu::gather( + dev_ctx.x_context(), + reinterpret_cast(sin->data()), + position_ids->data(), + sin_data, + {seq_len, head_dim}, + batch_size * seq_len, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather"); + ret = xpu::gather( + dev_ctx.x_context(), + reinterpret_cast(cos->data()), + position_ids->data(), + cos_data, + {seq_len, head_dim}, + batch_size * seq_len, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather"); + } else { + int sin_cos_batch_size = (dims_size) == 4 ? sin_cos_dims[0] : 1; + ret = xpu::broadcast( + dev_ctx.x_context(), + reinterpret_cast(sin->data()), + sin_data, + {sin_cos_batch_size, seq_len, head_dim}, + {batch_size, seq_len, head_dim}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast"); + ret = xpu::broadcast( + dev_ctx.x_context(), + reinterpret_cast(cos->data()), + cos_data, + {sin_cos_batch_size, seq_len, head_dim}, + {batch_size, seq_len, head_dim}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast"); + } +} + +template +void GetSinCosByRotaryBase(const Context& dev_ctx, + XPUSCType* sin_data, + XPUSCType* cos_data, + int64_t batch_size, + int64_t seq_len, + int64_t head_dim, + float rotary_emb_base) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + float* pos_seq_data = RAII_GUARD.alloc_l3_or_gm(seq_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(pos_seq_data); + int ret = + xpu::range(dev_ctx.x_context(), pos_seq_data, 0.0f, 1.0f, seq_len); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "range"); + float* freqs_half_data = RAII_GUARD.alloc_l3_or_gm(head_dim / 2); + PADDLE_ENFORCE_XDNN_NOT_NULL(freqs_half_data); + ret = xpu::range(dev_ctx.x_context(), + freqs_half_data, + 0.0f, + 2.0f / head_dim, + head_dim / 2); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "range"); + + float* rotary_base_xpu_data = RAII_GUARD.alloc_l3_or_gm(1); + PADDLE_ENFORCE_XDNN_NOT_NULL(rotary_base_xpu_data); + ret = xpu::constant( + dev_ctx.x_context(), rotary_base_xpu_data, 1, rotary_emb_base); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant"); + ret = xpu::broadcast_pow(dev_ctx.x_context(), + rotary_base_xpu_data, + freqs_half_data, + freqs_half_data, + {1}, + {head_dim / 2}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast_pow"); + ret = xpu::reciprocal( + dev_ctx.x_context(), freqs_half_data, freqs_half_data, head_dim / 2); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reciprocal"); + float* freqs_data = RAII_GUARD.alloc_l3_or_gm(head_dim); + ret = xpu::broadcast(dev_ctx.x_context(), + freqs_half_data, + freqs_data, + {head_dim / 2, 1}, + {head_dim / 2, 2}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast"); + int64_t rotary_len = seq_len * head_dim; + float* indices_data = RAII_GUARD.alloc_l3_or_gm(rotary_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(indices_data); + + ret = xpu::broadcast_mul(dev_ctx.x_context(), + pos_seq_data, + freqs_data, + 
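                            // Clarifying note (comment added): broadcasting
                            // positions [seq_len, 1] against the inverse
                            // frequencies [1, head_dim] yields the per-position
                            // rotation angles of shape [seq_len, head_dim],
                            // written into indices_data below.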
indices_data, + {seq_len, 1}, + {1, head_dim}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast_mul"); + + float* sin_fp32_data = nullptr; + float* cos_fp32_data = nullptr; + XPUSCType* sin_part_data = nullptr; + XPUSCType* cos_part_data = nullptr; + bool need_cast = !std::is_same::value; + bool need_broadcast = batch_size > 1; + if (need_broadcast) { + sin_part_data = RAII_GUARD.alloc_l3_or_gm(rotary_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(sin_part_data); + cos_part_data = RAII_GUARD.alloc_l3_or_gm(rotary_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(cos_part_data); + } else { + sin_part_data = sin_data; + cos_part_data = cos_data; + } + if (need_cast) { + sin_fp32_data = RAII_GUARD.alloc_l3_or_gm(rotary_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(sin_fp32_data); + cos_fp32_data = RAII_GUARD.alloc_l3_or_gm(rotary_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(cos_fp32_data); + } else { + sin_fp32_data = reinterpret_cast(sin_part_data); + cos_fp32_data = reinterpret_cast(cos_part_data); + } + ret = xpu::sin( + dev_ctx.x_context(), indices_data, sin_fp32_data, rotary_len); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "sin"); + ret = xpu::cos( + dev_ctx.x_context(), indices_data, cos_fp32_data, rotary_len); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "cos"); + + if (need_cast) { + ret = xpu::cast( + dev_ctx.x_context(), sin_fp32_data, sin_part_data, rotary_len); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "cast"); + ret = xpu::cast( + dev_ctx.x_context(), cos_fp32_data, cos_part_data, rotary_len); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "cast"); + } + + if (need_broadcast) { + ret = xpu::broadcast(dev_ctx.x_context(), + sin_part_data, + sin_data, + {1, seq_len, head_dim}, + {batch_size, seq_len, head_dim}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast"); + ret = xpu::broadcast(dev_ctx.x_context(), + cos_part_data, + cos_data, + {1, seq_len, head_dim}, + {batch_size, seq_len, head_dim}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast"); + } +} + +template void XPUGetSinCosData(const Context& dev_ctx, - const paddle::optional& sin_cos, + const paddle::optional& sin, + const paddle::optional& cos, const paddle::optional& position_ids, - XPUType* sin_cos_data, + XPUSCType* sin_data, + XPUSCType* cos_data, int64_t batch_size, int64_t seq_len, - int64_t head_dim) { - if (sin_cos.get_ptr()) { - auto sin_cos_dims = sin_cos.get_ptr()->dims(); - int64_t dims_size = sin_cos_dims.size(); - PADDLE_ENFORCE_EQ((dims_size == 2 || dims_size == 4), - true, - common::errors::InvalidArgument( - "The dims of sin and cos is expected to " - "be 2 or 4, but received %d.", - dims_size)); - if (dims_size == 4) { - // sin.shape: [1, seq_len, 1, head_dim] - PADDLE_ENFORCE_EQ((sin_cos_dims[2] == 1), - true, - common::errors::InvalidArgument( - "The num_heads of sin and cos must be 1.")); - } - int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0; - if (position_ids.get_ptr()) { - PADDLE_ENFORCE_EQ( - (sin_cos_dims[dims_size - 1] == head_dim && - sin_cos_dims[sin_seq_len_dim] >= seq_len), - true, - common::errors::InvalidArgument( - "The seq_len of sin and cos must be greater than or equal to " - "this of q. 
The head_dim of sin and cos must be the same as this " - "of q.")); - - auto position_ids_dims = position_ids.get_ptr()->dims(); - PADDLE_ENFORCE_EQ(position_ids_dims.size(), - 2, - common::errors::InvalidArgument( - "The dims of position_ids is expected to " - "be 2, but received %d.", - position_ids_dims.size())); - - PADDLE_ENFORCE_EQ( - (position_ids_dims[0] == batch_size && - position_ids_dims[1] == seq_len), - true, - common::errors::InvalidArgument( - "The batch_size and seq_len of position_ids must be the same as " - "those of q.")); - using XPUTypeFp16 = typename XPUTypeTrait::Type; - using XPUTypeBf16 = typename XPUTypeTrait::Type; - if (std::is_same::value) { - int ret = xpu::gather( - dev_ctx.x_context(), - reinterpret_cast(sin_cos->data()), - position_ids->data(), - reinterpret_cast(sin_cos_data), - {seq_len, head_dim}, - batch_size * seq_len, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather"); - } else { - int ret = xpu::gather( - dev_ctx.x_context(), - reinterpret_cast(sin_cos->data()), - position_ids->data(), - sin_cos_data, - {seq_len, head_dim}, - batch_size * seq_len, - 0); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather"); - } - } else { - int sin_cos_batch_size = (dims_size) == 4 ? sin_cos_dims[0] : 1; - int ret = xpu::broadcast( - dev_ctx.x_context(), - reinterpret_cast(sin_cos->data()), - sin_cos_data, - {sin_cos_batch_size, seq_len, head_dim}, - {batch_size, seq_len, head_dim}); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast"); - } + int64_t head_dim, + float rotary_emb_base) { + if (sin && cos) { + GetSinCosByPassValue(dev_ctx, + sin, + cos, + position_ids, + sin_data, + cos_data, + batch_size, + seq_len, + head_dim); } else { - int ret = xpu::constant(dev_ctx.x_context(), - sin_cos_data, - batch_size * seq_len * head_dim, - static_cast(0.0f)); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant"); + GetSinCosByRotaryBase(dev_ctx, + sin_data, + cos_data, + batch_size, + seq_len, + head_dim, + rotary_emb_base); + } +} + +template +void XPUFusedRotaryEveryTwo(const Context& dev_ctx, + const DenseTensor& in_q, + const paddle::optional& in_k, + const paddle::optional& in_v, + const XPUSCType* sin_data, + const XPUSCType* cos_data, + int64_t batch_size, + int64_t seq_len, + int64_t num_heads, + int64_t head_dim, + bool time_major, + bool is_bwd, + DenseTensor* out_q, + DenseTensor* out_k, + DenseTensor* out_v) { + auto single_func = &xpu::rotary_embedding_v3_single; + auto fusion_func = &xpu::rotary_embedding_v3; + const char* single_func_name = "rotary_embedding_v3_single"; + const char* fusion_func_name = "rotary_embedding_v3"; + if (is_bwd) { + single_func = &xpu::rotary_embedding_v3_single_grad; + fusion_func = &xpu::rotary_embedding_v3_grad; + single_func_name = "rotary_embedding_v3_single_grad"; + fusion_func_name = "rotary_embedding_v3_grad"; + } + if (!in_k) { + int ret = single_func( + dev_ctx.x_context(), + reinterpret_cast(in_q.data()), + cos_data, + sin_data, + reinterpret_cast(out_q->data()), + batch_size, + seq_len, + num_heads, + head_dim, + {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, + "BLHD", + true); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, single_func_name); + } else { + int64_t num_heads_k = in_k->dims()[2]; + int ret = fusion_func( + dev_ctx.x_context(), + reinterpret_cast(in_q.data()), + reinterpret_cast(in_k->data()), + cos_data, + sin_data, + reinterpret_cast(out_q->data()), + reinterpret_cast(out_k->data()), + batch_size, + seq_len, + num_heads, + head_dim, + {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, + {seq_len * 
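        // Clarifying note (comment added): k carries its own head count, so
        // its strides are built from num_heads_k, which may be smaller than
        // num_heads (e.g. for grouped-query attention).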
num_heads_k * head_dim, num_heads_k * head_dim, head_dim, 1}, + num_heads_k, + "BLHD", + true); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, fusion_func_name); + } + + if (in_v) { + int64_t num_heads_v = in_v->dims()[2]; + int ret = single_func( + dev_ctx.x_context(), + reinterpret_cast(in_v->data()), + cos_data, + sin_data, + reinterpret_cast(out_v->data()), + batch_size, + seq_len, + num_heads_v, + head_dim, + {seq_len * num_heads_v * head_dim, num_heads_v * head_dim, head_dim, 1}, + "BLHD", + true); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, single_func_name); } } -template +template void XPUFusedRotaryHalf(const Context& dev_ctx, - const XPUType* in_data, - const XPUType* sin_data, - const XPUType* cos_data, - XPUType* out_data, + const DenseTensor& in_q, + const paddle::optional& in_k, + const paddle::optional& in_v, + const XPUSCType* sin_data, + const XPUSCType* cos_data, int64_t batch_size, int64_t seq_len, int64_t num_heads, int64_t head_dim, - bool is_bwd = false) { - auto func = &xpu::rotary_no_freqs_embedding_v2; + bool time_major, + bool is_bwd, + DenseTensor* out_q, + DenseTensor* out_k, + DenseTensor* out_v) { + PADDLE_ENFORCE_EQ( + (std::is_same::value), + true, + common::errors::Unimplemented("The xpu rotary half do not support " + "sin/cos with different dtype as input.")); + auto single_func = &xpu::rotary_no_freqs_embedding_v2; + auto fusion_func = &xpu::rotary_no_freqs_qk_embedding_v2; + const char* single_func_name = "rotary_no_freqs_embedding_v2"; + const char* fusion_func_name = "xpu::rotary_no_freqs_qk_embedding_v2"; if (is_bwd) { - func = &xpu::rotary_no_freqs_embedding_v2_grad; + single_func = &xpu::rotary_no_freqs_embedding_v2_grad; + fusion_func = &xpu::rotary_no_freqs_qk_embedding_v2_grad; } - int ret = - func(dev_ctx.x_context(), - in_data, - sin_data, - cos_data, - out_data, - {batch_size, seq_len, num_heads, head_dim}, - {batch_size, seq_len, 1, head_dim}, - {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, - {seq_len * head_dim, head_dim, head_dim, 1}); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, - is_bwd ? 
"rotary_no_freqs_embedding_v2_grad" - : "rotary_no_freqs_embedding_v2"); + if (head_dim * sizeof(XPUType) <= 1024 && head_dim % 64 == 0 && in_k) { + int64_t num_heads_k = in_k->dims()[2]; + int ret = fusion_func( + dev_ctx.x_context(), + reinterpret_cast(in_q.data()), + reinterpret_cast(in_k->data()), + reinterpret_cast(sin_data), + reinterpret_cast(cos_data), + reinterpret_cast(out_q->data()), + reinterpret_cast(out_k->data()), + {batch_size, seq_len, num_heads, head_dim}, + {batch_size, seq_len, 1, head_dim}, + {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, + {seq_len * head_dim, head_dim, head_dim, 1}, + num_heads_k); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, fusion_func_name); + } else { + int ret = single_func( + dev_ctx.x_context(), + reinterpret_cast(in_q.data()), + reinterpret_cast(sin_data), + reinterpret_cast(cos_data), + reinterpret_cast(out_q->data()), + {batch_size, seq_len, num_heads, head_dim}, + {batch_size, seq_len, 1, head_dim}, + {seq_len * num_heads * head_dim, num_heads * head_dim, head_dim, 1}, + {seq_len * head_dim, head_dim, head_dim, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, single_func_name); + if (in_k) { + int64_t num_heads_k = in_k->dims()[2]; + int ret = single_func(dev_ctx.x_context(), + reinterpret_cast(in_k->data()), + reinterpret_cast(sin_data), + reinterpret_cast(cos_data), + reinterpret_cast(out_k->data()), + {batch_size, seq_len, num_heads_k, head_dim}, + {batch_size, seq_len, 1, head_dim}, + {seq_len * num_heads_k * head_dim, + num_heads_k * head_dim, + head_dim, + 1}, + {seq_len * head_dim, head_dim, head_dim, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, single_func_name); + } + } + + if (in_v) { + int64_t num_heads_v = in_v->dims()[2]; + int ret = single_func( + dev_ctx.x_context(), + reinterpret_cast(in_v->data()), + reinterpret_cast(sin_data), + reinterpret_cast(cos_data), + reinterpret_cast(out_v->data()), + {batch_size, seq_len, num_heads_v, head_dim}, + {batch_size, seq_len, 1, head_dim}, + {seq_len * num_heads_v * head_dim, num_heads_v * head_dim, head_dim, 1}, + {seq_len * head_dim, head_dim, head_dim, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, single_func_name); + } +} +template +void XPUFusedRopeImpl(const Context& dev_ctx, + const DenseTensor& q, + const paddle::optional& k, + const paddle::optional& v, + const paddle::optional& sin, + const paddle::optional& cos, + const paddle::optional& position_ids, + bool use_neox_rotary_style, + bool time_major, + bool is_bwd, + float rotary_emb_base, + DenseTensor* out_q, + DenseTensor* out_k, + DenseTensor* out_v) { + using XPUType = typename XPUTypeTrait::Type; + using XPUSCType = typename XPUTypeTrait::Type; + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + if (q.numel() <= 0) { + return; + } + int64_t batch_size = q.dims()[0]; + int64_t seq_len = q.dims()[1]; + int64_t num_heads = q.dims()[2]; + int64_t head_dim = q.dims()[3]; + PADDLE_ENFORCE_EQ(head_dim % 2, + 0, + common::errors::InvalidArgument( + "The head_dim of input must be a multiple of 2.")); + PADDLE_ENFORCE_EQ( + time_major, + false, + common::errors::InvalidArgument("time_major is not supported in xpu")); + + int64_t sin_cos_len = batch_size * seq_len * head_dim; + auto* sin_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(sin_data); + auto* cos_data = RAII_GUARD.alloc_l3_or_gm(sin_cos_len); + PADDLE_ENFORCE_XDNN_NOT_NULL(cos_data); + XPUGetSinCosData(dev_ctx, + sin, + cos, + position_ids, + sin_data, + cos_data, + batch_size, + seq_len, + head_dim, + rotary_emb_base); + if (use_neox_rotary_style) 
{ + XPUFusedRotaryEveryTwo(dev_ctx, + q, + k, + v, + sin_data, + cos_data, + batch_size, + seq_len, + num_heads, + head_dim, + time_major, + is_bwd, + out_q, + out_k, + out_v); + } else { + XPUFusedRotaryHalf(dev_ctx, + q, + k, + v, + sin_data, + cos_data, + batch_size, + seq_len, + num_heads, + head_dim, + time_major, + is_bwd, + out_q, + out_k, + out_v); + } } } // namespace fusion } // namespace phi diff --git a/test/xpu/test_fused_rotary_position_embedding_op_xpu.py b/test/xpu/test_fused_rotary_position_embedding_op_xpu.py index 283623dcc9f32..88b04cf268fb0 100644 --- a/test/xpu/test_fused_rotary_position_embedding_op_xpu.py +++ b/test/xpu/test_fused_rotary_position_embedding_op_xpu.py @@ -56,38 +56,21 @@ def mult_qkv_rotate_half(value, cos_tensor, sin_tensor): return query -def get_sin_cos_tensor(seq_len, head_dim, sign=1, dtype="float32"): +def get_sin_cos_tensor(seq_len, head_dim, dtype="float32"): pos_seq = paddle.arange(0, seq_len, 1).astype("float32") indices = paddle.arange(0, head_dim, 2).astype("float32") - indices = 1 / 10000 ** (indices / head_dim) + indices = 1 / (10000 ** (indices / head_dim)) sinusoid_inp = pos_seq.unsqueeze(1) * indices.unsqueeze(0) + sinusoid_inp = paddle.stack([sinusoid_inp, sinusoid_inp], axis=-1) + sin = paddle.sin(sinusoid_inp) + cos = paddle.cos(sinusoid_inp) + sin = sin.astype(dtype).reshape([1, seq_len, 1, head_dim]) + cos = cos.astype(dtype).reshape([1, seq_len, 1, head_dim]) + return sin, cos - sin_sin = paddle.empty([seq_len * head_dim], dtype=dtype) - cos_cos = paddle.empty([seq_len * head_dim], dtype=dtype) - i = 0 - - for value in sinusoid_inp.flatten(): - sin_sin[i * 2] = sign * paddle.sin(value) - cos_cos[i * 2 + 0] = paddle.cos(value) - sin_sin[i * 2 + 1] = paddle.sin(value) - cos_cos[i * 2 + 1] = paddle.cos(value) - i += 1 - - tensor_sin = paddle.reshape( - sin_sin, - [1, seq_len, 1, head_dim], - ) - tensor_cos = paddle.reshape( - cos_cos, - [1, seq_len, 1, head_dim], - ) - - return tensor_sin.astype(dtype), tensor_cos.astype(dtype) - - -def paddle_fused_rotary_position_embedding( +def ref_rotary_position_embedding( init_q, init_k=None, init_v=None, @@ -144,7 +127,7 @@ def get_paddle_tensor(self, shape): if shape is None: return None - tmp = paddle.randn(shape, self.dtype) + tmp = paddle.uniform(shape, self.dtype, -1.0, 1.0) tmp.stop_gradient = False return tmp @@ -167,7 +150,7 @@ def get_inputs(self, seed, with_sin_cos, dtype="float32"): tensor_v = self.get_paddle_tensor(self.shape_v) tensor_sin, tensor_cos = ( - get_sin_cos_tensor(tensor_q.shape[1], tensor_q.shape[3], 1, dtype) + get_sin_cos_tensor(tensor_q.shape[1], tensor_q.shape[3], dtype) if with_sin_cos else (None, None) ) @@ -202,9 +185,9 @@ def get_forward_backward( fw.append(out_k) fw.append(out_v) paddle.seed(seed + 1) - out_gq = paddle.randn(out_q.shape, self.dtype) - out_gk = paddle.randn(out_k.shape, self.dtype) - out_gv = paddle.randn(out_v.shape, self.dtype) + out_gq = paddle.uniform(out_q.shape, self.dtype, -1.0, 1.0) + out_gk = paddle.uniform(out_k.shape, self.dtype, -1.0, 1.0) + out_gv = paddle.uniform(out_v.shape, self.dtype, -1.0, 1.0) paddle.autograd.backward( [out_q, out_k, out_v], [out_gq, out_gk, out_gv], True @@ -215,52 +198,63 @@ def get_forward_backward( return fw, bw + def check_forward_backward( + self, ref_fwd, fused_fwd, ref_bwd=None, fused_bwd=None + ): + for i in range(len(ref_fwd)): + ref_fwd_np = ref_fwd[i].numpy() + fused_fwd_np = fused_fwd[i].numpy() + if ref_bwd is not None: + ref_bwd_np = ref_bwd[i].numpy() + fused_bwd_np = fused_bwd[i].numpy() + 
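            # Clarifying note (comment added): bfloat16 tensors are
            # materialized as uint16 numpy arrays, so both sides are converted
            # to float32 before the tolerance comparison below.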
if self.dtype == "bfloat16": + ref_fwd_np = convert_uint16_to_float(ref_fwd_np) + fused_fwd_np = convert_uint16_to_float(fused_fwd_np) + if ref_bwd is not None: + ref_bwd_np = convert_uint16_to_float(ref_bwd_np) + fused_bwd_np = convert_uint16_to_float(fused_bwd_np) + np.testing.assert_allclose( + ref_fwd_np, fused_fwd_np, rtol=self.rtol, atol=self.atol + ) + if ref_bwd is not None: + np.testing.assert_allclose( + ref_bwd_np, fused_bwd_np, rtol=self.rtol, atol=self.atol + ) + def test_fused_rope(self): paddle.set_device('xpu') p_fw, p_bw = self.get_forward_backward( - paddle_fused_rotary_position_embedding, + ref_rotary_position_embedding, seed=self.seed, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) f_fw, f_bw = self.get_forward_backward( fused_rotary_position_embedding, seed=self.seed, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), f_fw[i].numpy(), rtol=self.rtol, atol=self.atol - ) - np.testing.assert_allclose( - p_bw[i].numpy(), f_bw[i].numpy(), rtol=self.rtol, atol=self.atol - ) + self.check_forward_backward(p_fw, f_fw, p_bw, f_bw) - def test_fused_rope_with_sin_cos(self): + def test_fused_rope_without_sin_cos(self): paddle.set_device('xpu') p_fw, p_bw = self.get_forward_backward( - paddle_fused_rotary_position_embedding, + ref_rotary_position_embedding, seed=self.seed, with_sin_cos=True, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) f_fw, f_bw = self.get_forward_backward( fused_rotary_position_embedding, seed=self.seed, - with_sin_cos=True, - use_neox_rotary_style=False, + with_sin_cos=False, + use_neox_rotary_style=True, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), f_fw[i].numpy(), rtol=self.rtol, atol=self.atol - ) - np.testing.assert_allclose( - p_bw[i].numpy(), f_bw[i].numpy(), rtol=self.rtol, atol=self.atol - ) + self.check_forward_backward(p_fw, f_fw, p_bw, f_bw) def test_fused_rope_rotate_half(self): paddle.set_device('xpu') p_fw, p_bw = self.get_forward_backward( - paddle_fused_rotary_position_embedding, + ref_rotary_position_embedding, seed=self.seed, use_neox_rotary_style=False, ) @@ -269,13 +263,7 @@ def test_fused_rope_rotate_half(self): seed=self.seed, use_neox_rotary_style=False, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), f_fw[i].numpy(), rtol=self.rtol, atol=self.atol - ) - np.testing.assert_allclose( - p_bw[i].numpy(), f_bw[i].numpy(), rtol=self.rtol, atol=self.atol - ) + self.check_forward_backward(p_fw, f_fw, p_bw, f_bw) def test_fused_rope_position_ids(self): paddle.set_device('xpu') @@ -283,7 +271,7 @@ def test_fused_rope_position_ids(self): [[7, 5, 4, 6, 3, 1, 2, 0], [3, 1, 4, 0, 7, 6, 5, 2]] ) p_fw, p_bw = self.get_forward_backward( - paddle_fused_rotary_position_embedding, + ref_rotary_position_embedding, seed=self.seed, position_ids=position_ids, use_neox_rotary_style=False, @@ -294,13 +282,7 @@ def test_fused_rope_position_ids(self): position_ids=position_ids, use_neox_rotary_style=False, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), f_fw[i].numpy(), rtol=self.rtol, atol=self.atol - ) - np.testing.assert_allclose( - p_bw[i].numpy(), f_bw[i].numpy(), rtol=self.rtol, atol=self.atol - ) + self.check_forward_backward(p_fw, f_fw, p_bw, f_bw) def test_static(self): paddle.set_device('xpu') @@ -308,9 +290,9 @@ def test_static(self): self.seed, True, self.dtype ) p_fw, p_bw = self.get_forward_backward( - 
paddle_fused_rotary_position_embedding, + ref_rotary_position_embedding, seed=self.seed, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) paddle.enable_static() @@ -342,7 +324,7 @@ def test_static(self): sin, cos, position_ids=None, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) exe = paddle.static.Executor() @@ -359,11 +341,16 @@ def test_static(self): feed=feed, fetch_list=[out_q, out_k, out_v], ) - for i in range(3): + ref_fwd_np = p_fw[i].numpy() + fused_fwd_np = outs[i] + if self.dtype == "bfloat16": + ref_fwd_np = convert_uint16_to_float(ref_fwd_np) + fused_fwd_np = convert_uint16_to_float(fused_fwd_np) np.testing.assert_allclose( - p_fw[i].numpy(), outs[i], rtol=self.rtol, atol=self.atol + ref_fwd_np, fused_fwd_np, rtol=self.rtol, atol=self.atol ) + paddle.disable_static() @@ -377,22 +364,32 @@ def init_case(self): self.dtype = "float16" -class XPUTestFusedRotaryPositionEmbeddingBf16_1(unittest.TestCase): - def setUp(self): +class XPUTestFusedRotaryPositionEmbeddingBf16_1( + XPUTestFusedRotaryPositionEmbedding +): + def init_case(self): self.shape_q = [2, 8, 2, 16] self.shape_k = [2, 8, 2, 16] self.shape_v = [2, 8, 2, 16] + self.dtype = "bfloat16" + + +class XPUTestFusedRotaryPositionEmbeddingBf16_2(unittest.TestCase): + def setUp(self): + self.shape_q = [2, 2048, 16, 128] + self.shape_k = [2, 2048, 16, 128] + self.shape_v = [2, 2048, 16, 128] def test_api(self): paddle.disable_static() - q_bf16 = paddle.randn(self.shape_q, dtype="bfloat16") - k_bf16 = paddle.randn(self.shape_k, dtype="bfloat16") - v_bf16 = paddle.randn(self.shape_v, dtype="bfloat16") - sin_bf16 = paddle.randn( - [1, self.shape_q[1], 1, self.shape_q[3]], dtype="bfloat16" + q_bf16 = paddle.uniform(self.shape_q, "bfloat16", -1.0, 1.0) + k_bf16 = paddle.uniform(self.shape_k, "bfloat16", -1.0, 1.0) + v_bf16 = paddle.uniform(self.shape_v, "bfloat16", -1.0, 1.0) + sin_bf16 = paddle.uniform( + [1, self.shape_q[1], 1, self.shape_q[3]], "bfloat16", -1.0, 1.0 ) - cos_bf16 = paddle.randn( - [1, self.shape_q[1], 1, self.shape_q[3]], dtype="bfloat16" + cos_bf16 = paddle.uniform( + [1, self.shape_q[1], 1, self.shape_q[3]], "bfloat16", -1.0, 1.0 ) q_bf16.stop_gradient = False k_bf16.stop_gradient = False @@ -414,26 +411,26 @@ def test_api(self): sin_bf16, cos_bf16, position_ids=position_ids, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) - grad_out_q_bf16 = paddle.randn(self.shape_q, dtype="bfloat16") - grad_out_k_bf16 = paddle.randn(self.shape_k, dtype="bfloat16") - grad_out_v_bf16 = paddle.randn(self.shape_v, dtype="bfloat16") + grad_out_q_bf16 = paddle.uniform(self.shape_q, "bfloat16", -1.0, 1.0) + grad_out_k_bf16 = paddle.uniform(self.shape_k, "bfloat16", -1.0, 1.0) + grad_out_v_bf16 = paddle.uniform(self.shape_v, "bfloat16", -1.0, 1.0) paddle.autograd.backward( out_bf16, [grad_out_q_bf16, grad_out_k_bf16, grad_out_v_bf16], True ) grad_bf16 = [q_bf16.grad, k_bf16.grad, v_bf16.grad] - out_fp32 = paddle_fused_rotary_position_embedding( + out_fp32 = ref_rotary_position_embedding( q_fp32, k_fp32, v_fp32, sin_fp32, cos_fp32, position_ids=position_ids, - use_neox_rotary_style=False, + use_neox_rotary_style=True, ) grad_out_q_fp32 = paddle.to_tensor(grad_out_q_bf16, dtype="float32") @@ -456,15 +453,6 @@ def test_api(self): ) -class XPUTestFusedRotaryPositionEmbeddingBf16_2( - XPUTestFusedRotaryPositionEmbeddingBf16_1 -): - def setUp(self): - self.shape_q = [2, 2048, 16, 128] - self.shape_k = [2, 2048, 16, 128] - self.shape_v = [2, 2048, 16, 128] - - class 
XPUTestFusedRotaryPositionEmbeddingGQA( XPUTestFusedRotaryPositionEmbedding ): From 1c89f08d391c783b25b47fa71682e7a02ef4af65 Mon Sep 17 00:00:00 2001 From: Chen Zhiyang <1792266893@qq.com> Date: Sat, 28 Sep 2024 12:44:04 +0800 Subject: [PATCH 002/135] [PIR]add patch header template (#68481) * add patch header template * fix ci --- .../pir/serialize_deserialize/CMakeLists.txt | 25 ++++- .../include/patch_util.h | 3 +- .../serialize_deserialize/include/schema.h | 12 ++- .../include/version_compat.h | 8 +- .../pir/serialize_deserialize/patch/1.yaml | 0 .../pir/serialize_deserialize/patch/Readme.md | 23 ++++- .../serialize_deserialize/patch/template.h.in | 7 ++ .../serialize_deserialize/src/interface.cc | 8 +- .../src/ir_deserialize.cc | 13 ++- .../serialize_deserialize/src/patch_util.cc | 74 +++++++------- .../pir/serialize_deserialize/src/schema.cc | 40 +------- .../src/version_compat.cc | 99 ++++++++++++------- .../pir/serialize_deserialize/CMakeLists.txt | 3 - .../save_load_version_compat_test.cc | 4 +- 14 files changed, 178 insertions(+), 141 deletions(-) rename {python/paddle => paddle/fluid}/pir/serialize_deserialize/patch/1.yaml (100%) rename {python/paddle => paddle/fluid}/pir/serialize_deserialize/patch/Readme.md (87%) create mode 100644 paddle/fluid/pir/serialize_deserialize/patch/template.h.in diff --git a/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt b/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt index 3dc2ab14ff399..7e25771cdd7a3 100644 --- a/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt +++ b/paddle/fluid/pir/serialize_deserialize/CMakeLists.txt @@ -11,11 +11,28 @@ if(LINUX) link_libraries(stdc++fs) endif() -add_definitions(-DPADDLE_ROOT="${PADDLE_SOURCE_DIR}") -add_definitions( - -DPATCH_PATH="../../../../../python/paddle/pir/serialize_deserialize/patch") +file(GLOB_RECURSE YAML_PATCH_FILES "*.yaml") +# change pir version when new patches are added +add_definitions(-DDEVELOP_VERSION=1) +add_definitions(-DRELEASE_VERSION=1) +set(TEMPLATE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/patch/template.h.in) +set(PATCH_HEADER ${CMAKE_CURRENT_BINARY_DIR}/patch/patch.h) + +configure_file(${TEMPLATE_FILE} ${PATCH_HEADER} @ONLY) +file(WRITE "${PATCH_HEADER}" + "#include \n#include \n\n" + "const std::map yaml_files = {\n") + +foreach(PATCH_FILE ${YAML_PATCH_FILES}) + get_filename_component(FILENAME "${PATCH_FILE}" NAME_WE) + file(READ ${PATCH_FILE} FILE_CONTENT) + set(CONTENT "R\"(${FILE_CONTENT})\"") + file(APPEND "${PATCH_HEADER}" "{ \"${FILENAME}\", ${CONTENT} },\n") +endforeach() + +file(APPEND "${PATCH_HEADER}" "};\n") cc_library( pir_save_load - SRCS ${SERIALIZE_DESERIALIZE_CPP_SOURCES} + SRCS ${SERIALIZE_DESERIALIZE_CPP_SOURCES} ${PATCH_HEADER} DEPS op_dialect phi json yaml) diff --git a/paddle/fluid/pir/serialize_deserialize/include/patch_util.h b/paddle/fluid/pir/serialize_deserialize/include/patch_util.h index ca905bfb1179e..aa569d67e3bf3 100644 --- a/paddle/fluid/pir/serialize_deserialize/include/patch_util.h +++ b/paddle/fluid/pir/serialize_deserialize/include/patch_util.h @@ -35,6 +35,7 @@ Json ParseAttrPatches(const YAML::Node &root); Json ParseTypePatches(const YAML::Node &root); -Json YamlParser(const std::string &yaml_file); +/* Yaml file is set to be empty by default. It's only used for testing. 
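   (Clarifying note, added: when yaml_file is empty, YamlParser loads the
   patch YAML text that CMake embeds into the generated patch/patch.h, i.e.
   the yaml_files map, keyed by the version string.)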
 */
+Json YamlParser(const std::string &version, const std::string &yaml_file = "");

 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/schema.h b/paddle/fluid/pir/serialize_deserialize/include/schema.h
index a024226feeb0e..171f29c8bf10c 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/schema.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/schema.h
@@ -82,9 +82,17 @@ namespace pir {
 #define NULL_TYPE "NULL"

 // special op compress
-
 #define PARAMETEROP "p"

+// actions for patch
+#define DELETE "DEL"
+#define ADD "ADD"
+#define UPDATE "UPD"
+#define NEW_NAME "NN"
+#define ADD_ATTRS "ADD_A"
+#define ADD_OPRESULTS_ATTRS "ADD_OA"
+#define PATCH "patch"
+
 std::pair<std::string, std::string> GetContentSplitByDot(
     const std::string& str);

@@ -109,6 +117,4 @@ class DialectIdMap {
   std::unordered_map<std::string, std::string> DecompressDialect;
 };

-uint64_t GetPirVersion();
-uint64_t GetMaxReleasePirVersion();
 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/version_compat.h b/paddle/fluid/pir/serialize_deserialize/include/version_compat.h
index 97f2b3882fc2f..e11a9d2ac11e6 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/version_compat.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/version_compat.h
@@ -34,9 +34,11 @@ class PatchBuilder {
   PatchBuilder& operator=(const PatchBuilder&) = delete;
   PatchBuilder& operator=(PatchBuilder&&);

-  void IR_API BuildPatch(const std::string& path,
-                         uint64_t pir_version,
-                         uint64_t max_version);
+  /* Patch path is set to empty by default. It is only used for testing.
+   */
+  void IR_API BuildPatch(uint64_t pir_version,
+                         uint64_t max_version,
+                         const std::string& path = "");
   /* If file_version != pir_version, set file_version for finding patch
    * yamls. */
   void SetFileVersion(const uint64_t version) { file_version_ = version; }
diff --git a/python/paddle/pir/serialize_deserialize/patch/1.yaml b/paddle/fluid/pir/serialize_deserialize/patch/1.yaml
similarity index 100%
rename from python/paddle/pir/serialize_deserialize/patch/1.yaml
rename to paddle/fluid/pir/serialize_deserialize/patch/1.yaml
diff --git a/python/paddle/pir/serialize_deserialize/patch/Readme.md b/paddle/fluid/pir/serialize_deserialize/patch/Readme.md
similarity index 87%
rename from python/paddle/pir/serialize_deserialize/patch/Readme.md
rename to paddle/fluid/pir/serialize_deserialize/patch/Readme.md
index ba48fb434c7b0..864d51de42f4e 100644
--- a/python/paddle/pir/serialize_deserialize/patch/Readme.md
+++ b/paddle/fluid/pir/serialize_deserialize/patch/Readme.md
@@ -140,11 +140,24 @@ type_patches:
 ```

 ## pir_version configuration notes
### C++-side version management and CMake configuration
- Version numbers are managed on the C++ side and are configured in CMakeLists.txt.
- The PIR version number tracks PIR's own iteration and is tightly coupled to the patch yaml file names. Each time PIR is updated and a new patch file is added, the patch file name and the version number are incremented together. It is decoupled from Paddle's main version number and can iterate independently.
  ```cmake
  # change pir version when new patches are added
  add_definitions(-DDEVELOP_VERSION=1)
  add_definitions(-DRELEASE_VERSION=1)
  ```

  ```tree
  ├─patch
  │ ├─0.yaml
  │ └─1.yaml
  ```
  - RELEASE_VERSION is the PIR version number of the released versions, i.e. the largest patch yaml file name.
  - DEVELOP_VERSION is the PIR version number on the current develop branch; if there are unreleased new patches, they are configured in `0.yaml`, and the current develop PIR version number is 0.

- The pir_version parameter of ReadModule and WriteModule has a default value and does not need to be passed. The default value is -1; once inside the function, the current PIR version number configured in CMake is retrieved.

### Python side
- Paddle's main version number is defined on the Python side and has no relation to the PIR version. The Python side no longer needs to obtain and pass pir_version; the default value can be used directly.
diff --git a/paddle/fluid/pir/serialize_deserialize/patch/template.h.in b/paddle/fluid/pir/serialize_deserialize/patch/template.h.in
new file mode 100644
index 0000000000000..05886e7e55cad
--- /dev/null
+++ b/paddle/fluid/pir/serialize_deserialize/patch/template.h.in
@@ -0,0 +1,7 @@
+#pragma once
+#include <map>
+#include <string>
+
+const std::map<std::string, std::string> yaml_files = {
+  @FILE_CONTENTS@
+};
diff --git a/paddle/fluid/pir/serialize_deserialize/src/interface.cc b/paddle/fluid/pir/serialize_deserialize/src/interface.cc
index a3d94be777786..adb60536a7787 100644
--- a/paddle/fluid/pir/serialize_deserialize/src/interface.cc
+++ b/paddle/fluid/pir/serialize_deserialize/src/interface.cc
@@ -72,7 +72,7 @@ bool ReadModule(const std::string& file_path,
   std::ifstream f(file_path);
   Json data = Json::parse(f);
   if (pir_version < 0) {
-    pir_version = GetPirVersion();
+    pir_version = DEVELOP_VERSION;
     VLOG(6) << "pir_version is null, get pir_version: " << pir_version;
   }

@@ -85,7 +85,7 @@
     if (file_version != (uint64_t)pir_version) {
       builder.SetFileVersion(file_version);
       // Set max_version to the max version number of release pir plus 1.
-      auto max_version = GetMaxReleasePirVersion() + 1;
+      auto max_version = RELEASE_VERSION + 1;
       // If pir_version_ is not 0, we will build patch from file_version_ to
       // pir_version_; If pir_version_ is 0, we will first build patch from
       // file_version_ to max_version, and then add 0.yaml to the end.
       VLOG(6) << "file_version: " << file_version
               << ", pir_version: " << pir_version
               << ", final_version: " << version;
-      std::filesystem::path patch_path = std::filesystem::path(PATCH_PATH);
-      VLOG(8) << "Patch path: " << patch_path;
-      builder.BuildPatch(patch_path.string(), version, max_version);
+      builder.BuildPatch(version, max_version);
     }
   } else {
     PADDLE_THROW(common::errors::InvalidArgument("Invalid model file."));
   }
diff --git a/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc b/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc
index b33500d1706cd..0c1993d556bd4 100644
--- a/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc
+++ b/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc
@@ -240,9 +240,8 @@ pir::Operation* ProgramReader::ReadOp(Json* op_json) {
     VLOG(8) << op_name << " has been patched: " << *op_json;
     // Apply patch to op name
     // This happens when changing an op into another dialect
-    if (op_patch.contains("NEW_NAME")) {
-      std::string new_name =
-          op_patch.at("NEW_NAME").template get<std::string>();
+    if (op_patch.contains(NEW_NAME)) {
+      std::string new_name = op_patch.at(NEW_NAME).template get<std::string>();
       VLOG(8) << "change op name from " << op_name << " to " << new_name;
       op_name = new_name;
       op_json->at(ID) = op_name;
@@ -334,8 +333,8 @@ pir::AttributeMap ProgramReader::ReadAttributesMap(
     const std::unordered_map& attr_patch) {
   pir::AttributeMap attributes;
   // Add new attribute from patch
-  if (attr_patch.count("A_ADD")) {
-    for (auto& attr_json : attr_patch.at("A_ADD")) {
+  if (attr_patch.count(ADD_ATTRS)) {
+    for (auto& attr_json : attr_patch.at(ADD_ATTRS)) {
       attrs_json->insert(attrs_json->end(), attr_json);
     }
     VLOG(8) << "attr has been added: " << *attrs_json;
   }
   VLOG(6) << "Finish Read 
pir::AttributeMap."; // Add new opresult attribute from patch - if (attr_patch.count("OA_ADD")) { - for (auto& attr_json : attr_patch.at("OA_ADD")) { + if (attr_patch.count(ADD_OPRESULTS_ATTRS)) { + for (auto& attr_json : attr_patch.at(ADD_OPRESULTS_ATTRS)) { opresult_attrs_json->insert(opresult_attrs_json->end(), attr_json); } VLOG(8) << "opresult attr has been added: " << *opresult_attrs_json; diff --git a/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc b/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc index 7c68149d51fb8..91d5f37db2426 100644 --- a/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc +++ b/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc @@ -20,6 +20,7 @@ #include #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/serialize_deserialize/include/schema.h" +#include "paddle/fluid/pir/serialize_deserialize/patch/patch.h" #include "paddle/phi/common/data_type.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_type.h" @@ -262,8 +263,8 @@ Json ParseOpPairPatches(const YAML::Node &root) { VLOG(8) << "Op_pair_name: " << name; j_patch["op_pair"].push_back(op_name); } - j_patch["patch"] = Json::object(); - j_patch["patch"]["op_pair"] = j_patch["op_pair"]; + j_patch[PATCH] = Json::object(); + j_patch[PATCH]["op_pair"] = j_patch["op_pair"]; // parse actions auto actions = node["actions"]; for (size_t j = 0; j < actions.size(); j++) { @@ -278,20 +279,20 @@ Json ParseOpPairPatches(const YAML::Node &root) { Json j_add_out; j_add_out[ID] = out_id; j_add_out[TYPE_TYPE] = BuildTypeJsonPatch(action); - j_patch["patch"][OPRESULTS]["ADD"].push_back(j_add_out); + j_patch[PATCH][OPRESULTS][ADD].push_back(j_add_out); Json j_add_in; j_add_in[ID] = in_id; - j_patch["patch"][OPOPERANDS]["ADD"].push_back(j_add_in); + j_patch[PATCH][OPOPERANDS][ADD].push_back(j_add_in); } else if (action_name == "delete_value") { VLOG(8) << "Patch for deleting values."; int out_id = action["object"][0].as(); int in_id = action["object"][1].as(); Json j_del_out; j_del_out[ID] = out_id; - j_patch["patch"][OPRESULTS]["DELETE"].push_back(j_del_out); + j_patch[PATCH][OPRESULTS][DELETE].push_back(j_del_out); Json j_del_in; j_del_in[ID] = in_id; - j_patch["patch"][OPOPERANDS]["DELETE"].push_back(j_del_in); + j_patch[PATCH][OPOPERANDS][DELETE].push_back(j_del_in); } } json_patch.push_back(j_patch); @@ -314,7 +315,7 @@ Json ParseOpPatches(const YAML::Node &root) { VLOG(8) << "Parse patches for " << op_name; Json j_patch; j_patch["op_name"] = op_name; - j_patch["patch"] = Json::object(); + j_patch[PATCH] = Json::object(); // parse actions auto actions = node["actions"]; @@ -335,10 +336,10 @@ Json ParseOpPatches(const YAML::Node &root) { j_attr[ATTR_TYPE] = BuildAttrJsonPatch(action); if (action_name == "add_attr") { Json j_add = Json::object(); - j_add["ADD"] = j_attr; - j_patch["patch"][ATTRS].push_back(j_add); + j_add[ADD] = j_attr; + j_patch[PATCH][ATTRS].push_back(j_add); } else { - j_patch["patch"][ATTRS].push_back(j_attr); + j_patch[PATCH][ATTRS].push_back(j_attr); } } else if (action_name == "add_output_attr" || action_name == "modify_output_attr" || @@ -350,10 +351,10 @@ Json ParseOpPatches(const YAML::Node &root) { j_attr[ATTR_TYPE] = BuildAttrJsonPatch(action); if (action_name == "add_output_attr") { Json j_add = Json::object(); - j_add["ADD"] = j_attr; - j_patch["patch"][OPRESULTS_ATTRS].push_back(j_add); + j_add[ADD] = j_attr; + j_patch[PATCH][OPRESULTS_ATTRS].push_back(j_add); } else { - 
j_patch["patch"][OPRESULTS_ATTRS].push_back(j_attr); + j_patch[PATCH][OPRESULTS_ATTRS].push_back(j_attr); } } else if (action_name == "modify_attr_name" || action_name == "modify_output_attr_name") { @@ -362,35 +363,35 @@ Json ParseOpPatches(const YAML::Node &root) { std::string new_name = action["default"].as(); Json j_attr; j_attr[NAME] = old_name; - j_attr["NEW_NAME"] = new_name; + j_attr[NEW_NAME] = new_name; std::string col = action_name == "modify_attr_name" ? ATTRS : OPRESULTS_ATTRS; - j_patch["patch"][col].push_back(j_attr); + j_patch[PATCH][col].push_back(j_attr); } else if (action_name == "delete_input") { VLOG(8) << "Patch for delete_input"; Json j_input; int op_id = action["object"].as(); j_input[ID] = op_id; - j_patch["patch"][OPOPERANDS]["DELETE"].push_back(j_input); + j_patch[PATCH][OPOPERANDS][DELETE].push_back(j_input); } else if (action_name == "add_output") { VLOG(8) << "Patch for add_output"; Json j_output; int op_id = action["object"].as(); j_output[ID] = op_id; j_output[TYPE_TYPE] = BuildTypeJsonPatch(action); - j_patch["patch"][OPRESULTS]["ADD"].push_back(j_output); + j_patch[PATCH][OPRESULTS][ADD].push_back(j_output); } else if (action_name == "modify_output_type") { VLOG(8) << "Patch for modify_output_type"; int op_id = action["object"].as(); Json j_type; j_type[ID] = op_id; j_type[TYPE_TYPE] = BuildTypeJsonPatch(action); - j_patch["patch"][OPRESULTS]["UPDATE"].push_back(j_type); + j_patch[PATCH][OPRESULTS][UPDATE].push_back(j_type); } else if (action_name == "modify_name") { VLOG(8) << "Patch for modify_name"; std::string op_name = action["default"].as(); GetCompressOpName(&op_name); - j_patch["patch"]["NEW_NAME"] = op_name; + j_patch[PATCH][NEW_NAME] = op_name; } } json_patch.push_back(j_patch); @@ -410,15 +411,15 @@ Json ParseTypePatches(const YAML::Node &root) { VLOG(8) << "Type name after compressing: " << type_name; Json j_patch; j_patch["type_name"] = type_name; - j_patch["patch"] = Json::object(); + j_patch[PATCH] = Json::object(); auto actions = node["actions"]; for (size_t j = 0; j < actions.size(); j++) { YAML::Node action = actions[j]; std::string action_name = action["action"].as(); if (action_name == "modify_name") { - j_patch["patch"]["NEW_NAME"] = GetTypeName(action); + j_patch[PATCH][NEW_NAME] = GetTypeName(action); } else if (action_name == "delete_type") { - j_patch["patch"]["NEW_NAME"] = ""; + j_patch[PATCH][NEW_NAME] = ""; } } json_patch.push_back(j_patch); @@ -439,15 +440,15 @@ Json ParseAttrPatches(const YAML::Node &root) { VLOG(8) << attr_name; Json j_patch; j_patch["attr_name"] = attr_name; - j_patch["patch"] = Json::object(); + j_patch[PATCH] = Json::object(); auto actions = node["actions"]; for (size_t j = 0; j < actions.size(); j++) { YAML::Node action = actions[j]; std::string action_name = action["action"].as(); if (action_name == "modify_name") { - j_patch["patch"]["NEW_NAME"] = GetAttrName(action); + j_patch[PATCH][NEW_NAME] = GetAttrName(action); } else if (action_name == "delete_attr") { - j_patch["patch"]["NEW_NAME"] = ""; + j_patch[PATCH][NEW_NAME] = ""; } } json_patch.push_back(j_patch); @@ -455,15 +456,20 @@ Json ParseAttrPatches(const YAML::Node &root) { return json_patch; } -Json YamlParser(const std::string &yaml_file) { +Json YamlParser(const std::string &version, const std::string &yaml_file) { + YAML::Node root; std::ifstream fin; - VLOG(8) << yaml_file; - fin.open(yaml_file); - if (!fin) { - VLOG(8) << yaml_file << " is not fin and return empty. 
"; - return Json::object(); + if (yaml_file.empty()) { + root = YAML::Load(yaml_files.at(version)); + } else { + VLOG(8) << yaml_file; + fin.open(yaml_file); + if (!fin) { + VLOG(8) << yaml_file << " is not fin and return empty. "; + return Json::object(); + } + root = YAML::Load(fin); } - YAML::Node root = YAML::Load(fin); Json json_patch; if (!root.IsDefined()) { VLOG(8) << "Not defined"; @@ -484,7 +490,9 @@ Json YamlParser(const std::string &yaml_file) { Yaml attr_patch = root["attr_patches"]; json_patch["attr_patches"] = ParseAttrPatches(attr_patch); VLOG(8) << "Finish attr json_patch: " << json_patch; - fin.close(); + if (fin) { + fin.close(); + } return json_patch; } } // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/src/schema.cc b/paddle/fluid/pir/serialize_deserialize/src/schema.cc index 80761474a04cd..3f5f29f04969f 100644 --- a/paddle/fluid/pir/serialize_deserialize/src/schema.cc +++ b/paddle/fluid/pir/serialize_deserialize/src/schema.cc @@ -56,8 +56,8 @@ DialectIdMap::DialectIdMap() { insert(paddle::dialect::CustomOpDialect::name(), "3"); insert(paddle::dialect::DistDialect::name(), "4"); // TestDialect for test use - insert(test::TestDialect::name(), "5"); - insert(test1::Test1Dialect::name(), "6"); + insert(test::TestDialect::name(), "-1"); + insert(test1::Test1Dialect::name(), "-2"); } void DialectIdMap::insert(const std::string& key, const std::string& value) { CompressDialect[key] = value; @@ -88,40 +88,4 @@ std::string DialectIdMap::GetDecompressDialectId(const std::string& id) { } return ""; } - -uint64_t GetPirVersion() { - VLOG(8) << "Get PIR Version: "; - std::filesystem::path patch_path = std::filesystem::path(PATCH_PATH); - VLOG(8) << "Patch path: " << patch_path; - int version = 0; - for (auto& v : std::filesystem::directory_iterator(patch_path)) { - std::string filename = v.path().filename().string(); - std::string extension_name = v.path().extension().string(); - // 0.yaml for develop version - if (filename == "0.yaml") { - VLOG(8) << "Develop version: " << version; - return 0; - } else if (extension_name == ".yaml") { - version = stoi(filename) > version ? stoi(filename) : version; - } - } - VLOG(8) << "PIR version: " << version; - return version; -} -uint64_t GetMaxReleasePirVersion() { - std::filesystem::path patch_path = std::filesystem::path(PATCH_PATH); - VLOG(8) << "Patch path: " << patch_path; - int version = 0; - for (auto& v : std::filesystem::directory_iterator(patch_path)) { - std::string filename = v.path().filename().string(); - std::string extension_name = v.path().extension().string(); - VLOG(8) << filename; - if (extension_name == ".yaml") { - version = stoi(filename) > version ? 
stoi(filename) : version; - } - } - VLOG(8) << "Max Release PIR version: " << version; - return version; -} - } // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/src/version_compat.cc b/paddle/fluid/pir/serialize_deserialize/src/version_compat.cc index 8e6b17e81a6ed..14409dd6f9d18 100644 --- a/paddle/fluid/pir/serialize_deserialize/src/version_compat.cc +++ b/paddle/fluid/pir/serialize_deserialize/src/version_compat.cc @@ -14,28 +14,34 @@ #include "paddle/fluid/pir/serialize_deserialize/include/version_compat.h" #include -#include #include "paddle/fluid/pir/serialize_deserialize/include/patch_util.h" namespace pir { -void PatchBuilder::BuildPatch(const std::string& path, - uint64_t pir_version, - uint64_t max_version) { +void PatchBuilder::BuildPatch(uint64_t pir_version, + uint64_t max_version, + const std::string& path) { VLOG(6) << "Begin building patches... "; for (auto v = file_version_; v <= pir_version; v++) { - std::filesystem::path p(path.c_str()); - std::filesystem::path patch_path = p / std::to_string(v % max_version); - patch_path += ".yaml"; - VLOG(8) << "Patch file: " << patch_path; - patch_json = YamlParser(patch_path.string()); + std::string file_path = ""; + std::string file_name = std::to_string(v % max_version); + if (!path.empty()) { + std::filesystem::path p(path.c_str()); + std::filesystem::path patch_path = p / file_name; + patch_path += ".yaml"; + VLOG(8) << "Patch file: " << patch_path; + file_path = patch_path.string(); + } + patch_json = YamlParser(file_name, file_path); VLOG(8) << "Build version " << v << " patch: " << patch_json; for (auto patch_info : patch_json["op_pair_patches"]) { - op_pair_patches_.insert(patch_info["patch"]); + op_pair_patches_.insert(patch_info[PATCH]); + VLOG(8) << "merge op_pair patch: " << op_pair_patches_; } for (auto patch_info : patch_json["op_patches"]) { + VLOG(8) << "merge op patch: " << patch_info["op_name"]; if (op_patches_.count(patch_info["op_name"])) { Json op_patch_orig = op_patches_[patch_info["op_name"]]; - Json op_patch_new = patch_info["patch"]; + Json op_patch_new = patch_info[PATCH]; for (auto item : op_patch_new.items()) { std::string key = item.key(); Json value = item.value(); @@ -43,18 +49,37 @@ void PatchBuilder::BuildPatch(const std::string& path, op_patch_orig[key] = value; } else { Json value_orig = op_patch_orig[key]; - value_orig.insert(value_orig.end(), value.begin(), value.end()); + if (key == OPOPERANDS || key == OPRESULTS) { + for (auto action : value.items()) { + std::string action_key = action.key(); + Json action_value = action.value(); + if (value_orig.count(action_key) == 0) { + value_orig[action_key] = action_value; + } else { + value_orig[action_key].insert(value_orig[action_key].end(), + action_value.begin(), + action_value.end()); + } + } + } else if (key == NEW_NAME) { + value_orig = value; + } else { + value_orig.insert(value_orig.end(), value.begin(), value.end()); + } } } + VLOG(8) << "merge op patch: " << op_patches_[patch_info["op_name"]]; } else { - op_patches_[patch_info["op_name"]] = patch_info["patch"]; + op_patches_[patch_info["op_name"]] = patch_info[PATCH]; } } for (auto patch_info : patch_json["type_patches"]) { - type_patches_[patch_info["type_name"]].update(patch_info["patch"], true); + type_patches_[patch_info["type_name"]].update(patch_info[PATCH], true); + VLOG(8) << "merge type patch: " << type_patches_; } for (auto patch_info : patch_json["attr_patches"]) { - attr_patches_[patch_info["attr_name"]].update(patch_info["patch"], true); + 
attr_patches_[patch_info["attr_name"]].update(patch_info[PATCH], true); + VLOG(8) << "merge attr patch: " << attr_patches_; } } VLOG(8) << "Finish build op_pair_patches_: " << op_pair_patches_; @@ -69,8 +94,8 @@ std::unordered_map PatchBuilder::GetOpAttrPatchMap( std::unordered_map op_attr_patch; if (op_patch.count(ATTRS)) { for (Json item : op_patch[ATTRS]) { - if (item.count("ADD")) { - op_attr_patch["A_ADD"].push_back(item.at("ADD")); + if (item.count(ADD)) { + op_attr_patch[ADD_ATTRS].push_back(item.at(ADD)); } else { op_attr_patch[item[NAME]].push_back(item); } @@ -78,8 +103,8 @@ std::unordered_map PatchBuilder::GetOpAttrPatchMap( } if (op_patch.count(OPRESULTS_ATTRS)) { for (Json item : op_patch[OPRESULTS_ATTRS]) { - if (item.count("ADD")) { - op_attr_patch["OA_ADD"].push_back(item.at("ADD")); + if (item.count(ADD)) { + op_attr_patch[ADD_OPRESULTS_ATTRS].push_back(item.at(ADD)); } else { op_attr_patch[item[NAME]].push_back(item); } @@ -97,16 +122,16 @@ void PatchBuilder::ApplyOpPairPatches(int64_t* id) { std::string op2 = item["op_pair"][1]; Json op1_patch = item[OPRESULTS]; Json op2_patch = item[OPOPERANDS]; - for (uint64_t i = 0; i < op1_patch["ADD"].size(); ++i) { + for (uint64_t i = 0; i < op1_patch[ADD].size(); ++i) { max_id++; - op1_patch["ADD"][i][VALUE_ID] = max_id; - op2_patch["ADD"][i][VALUE_ID] = max_id; - op_patches_[op1][OPRESULTS]["ADD"].push_back(op1_patch["ADD"][i]); - op_patches_[op2][OPOPERANDS]["ADD"].push_back(op2_patch["ADD"][i]); + op1_patch[ADD][i][VALUE_ID] = max_id; + op2_patch[ADD][i][VALUE_ID] = max_id; + op_patches_[op1][OPRESULTS][ADD].push_back(op1_patch[ADD][i]); + op_patches_[op2][OPOPERANDS][ADD].push_back(op2_patch[ADD][i]); } - for (uint64_t i = 0; i < op1_patch["DELETE"].size(); ++i) { - op_patches_[op1][OPRESULTS]["DELETE"].push_back(op1_patch["DELETE"][i]); - op_patches_[op2][OPOPERANDS]["DELETE"].push_back(op2_patch["DELETE"][i]); + for (uint64_t i = 0; i < op1_patch[DELETE].size(); ++i) { + op_patches_[op1][OPRESULTS][DELETE].push_back(op1_patch[DELETE][i]); + op_patches_[op2][OPOPERANDS][DELETE].push_back(op2_patch[DELETE][i]); } } *id = max_id; @@ -164,7 +189,7 @@ void PatchBuilder::ApplyOpPatches(const std::string& op_name, if (patch.contains(OPOPERANDS)) { Json* json_in = &json->at(OPOPERANDS); Json in_patch = patch[OPOPERANDS]; - for (auto item : in_patch["ADD"]) { + for (auto item : in_patch[ADD]) { int id = item[ID].get(); auto index = json_in->begin() + id; VLOG(8) << "Add index: " << id; @@ -172,7 +197,7 @@ void PatchBuilder::ApplyOpPatches(const std::string& op_name, json_in->insert(index, item); VLOG(8) << "ADD output: " << json_in; } - for (auto item : in_patch["DELETE"]) { + for (auto item : in_patch[DELETE]) { int id = item[ID].get(); json_in->erase(id); } @@ -181,12 +206,12 @@ void PatchBuilder::ApplyOpPatches(const std::string& op_name, Json* json_out = &json->at(OPRESULTS); Json out_patch = patch[OPRESULTS]; VLOG(8) << "out patch: " << out_patch; - for (auto item : out_patch["UPDATE"]) { + for (auto item : out_patch[UPDATE]) { int id = item[ID].get(); item.erase(ID); json_out->at(id)[TYPE_TYPE] = item[TYPE_TYPE]; } - for (auto item : out_patch["ADD"]) { + for (auto item : out_patch[ADD]) { int id = item[ID].get(); auto index = json_out->begin() + id; VLOG(8) << "Add index: " << id; @@ -194,7 +219,7 @@ void PatchBuilder::ApplyOpPatches(const std::string& op_name, json_out->insert(index, item); VLOG(8) << "ADD output: " << json_out; } - for (auto item : out_patch["DELETE"]) { + for (auto item : out_patch[DELETE]) { int id = 
item[ID].get(); json_out->erase(id); } @@ -204,7 +229,7 @@ void PatchBuilder::ApplyOpPatches(const std::string& op_name, void PatchBuilder::ApplyTypePatches(const std::string& type_name, Json* json, Json patch) { - json->at(ID) = patch["NEW_NAME"]; + json->at(ID) = patch[NEW_NAME]; if (type_name == pir::DenseTensorType::name()) { std::string name = json->at(DATA).at(0).get(); if (HasTypePatch(name)) { @@ -219,8 +244,8 @@ void PatchBuilder::ApplyAttrPatches(const std::string& attr_name, Json patch) { std::string name = attr_name; for (auto item : patch) { - if (item.contains("NEW_NAME")) { - name = item["NEW_NAME"].get(); + if (item.contains(NEW_NAME)) { + name = item[NEW_NAME].get(); } else { json->merge_patch(item); } @@ -231,8 +256,8 @@ void PatchBuilder::ApplyAttrPatches(const std::string& attr_name, void PatchBuilder::ApplyAttrTypePatches(const std::string& attr_name, Json* json, Json patch) { - if (patch.contains("NEW_NAME")) { - json->at(ID) = patch["NEW_NAME"]; + if (patch.contains(NEW_NAME)) { + json->at(ID) = patch[NEW_NAME]; } } } // namespace pir diff --git a/test/cpp/pir/serialize_deserialize/CMakeLists.txt b/test/cpp/pir/serialize_deserialize/CMakeLists.txt index 3f62004369007..f7df24a66990d 100644 --- a/test/cpp/pir/serialize_deserialize/CMakeLists.txt +++ b/test/cpp/pir/serialize_deserialize/CMakeLists.txt @@ -2,9 +2,6 @@ paddle_test(test_builtin_parameter SRCS test_builtin_parameter.cc) paddle_test(save_load_version_compat_test SRCS save_load_version_compat_test.cc DEPS test_dialect) -copy_if_different(${CMAKE_CURRENT_SOURCE_DIR}/patch - ${CMAKE_CURRENT_BINARY_DIR}/patch) - if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will # be build only in CI, so suppose the generator in Windows is Ninja. 
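The patch_util.cc and version_compat.cc changes above do three things: string literals such as "patch", "ADD" and "NEW_NAME" are replaced by the named constants from patch/patch.h (PATCH, ADD, DELETE, UPDATE, NEW_NAME); YamlParser(version, yaml_file) now falls back to YAML text embedded in the binary (the yaml_files map keyed by version) when no file path is given; and PatchBuilder::BuildPatch merges per-op patches key by key instead of blindly appending. A rough Python model of the intended merge rule follows; the dictionary layout mirrors the JSON built by ParseOpPatches, but the function name and the literal key strings here are illustrative, not Paddle API.

    OPOPERANDS, OPRESULTS, NEW_NAME = "opoperands", "opresults", "NEW_NAME"

    def merge_op_patch(orig: dict, new: dict) -> dict:
        # Sketch of the merge loop in PatchBuilder::BuildPatch (version_compat.cc).
        for key, value in new.items():
            if key not in orig:
                orig[key] = value                  # first patch touching this key
            elif key in (OPOPERANDS, OPRESULTS):
                # Per-action lists (ADD / DELETE / UPDATE) are concatenated.
                for action, items in value.items():
                    orig[key].setdefault(action, []).extend(items)
            elif key == NEW_NAME:
                orig[key] = value                  # a later rename wins
            else:
                orig[key].extend(value)            # e.g. ATTRS / OPRESULTS_ATTRS lists
        return orig

Because deletions and renames are order sensitive, BuildPatch walks the versions upward from file_version_, merging one version's patch at a time.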
diff --git a/test/cpp/pir/serialize_deserialize/save_load_version_compat_test.cc b/test/cpp/pir/serialize_deserialize/save_load_version_compat_test.cc index 77d759d2b133c..c1b571c8139f6 100644 --- a/test/cpp/pir/serialize_deserialize/save_load_version_compat_test.cc +++ b/test/cpp/pir/serialize_deserialize/save_load_version_compat_test.cc @@ -69,7 +69,7 @@ TEST(save_load_version_compat, op_patch_test) { builder.SetFileVersion(1); std::filesystem::path patch_path("patch"); VLOG(8) << "Patch path: " << patch_path; - builder.BuildPatch(patch_path.string(), 2, 2); + builder.BuildPatch(2, 2, patch_path.string()); } bool ReadModuleForTest(const std::string &file_path, @@ -87,7 +87,7 @@ bool ReadModuleForTest(const std::string &file_path, builder.SetFileVersion(file_version); std::filesystem::path patch_path("patch"); VLOG(8) << "Patch path: " << patch_path; - builder.BuildPatch(patch_path.string(), 2, 2); + builder.BuildPatch(2, 2, patch_path.string()); } } else { PADDLE_THROW(::common::errors::InvalidArgument("Invalid model file: %s.", From 0b09f9dcadc7df9fc902e6479ea85dc089353187 Mon Sep 17 00:00:00 2001 From: Lucas Date: Sat, 28 Sep 2024 16:00:19 +0800 Subject: [PATCH 003/135] XPU, update xhpc to fix reduce_max bug when [1, t, n] --> [1, 1, n] && t>2048 && n < 128 && n%16!=0 (#68504) --- cmake/external/xpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index e317e96dcbbee..786975582270f 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -30,7 +30,7 @@ if(NOT DEFINED XPU_XRE_BASE_VERSION) set(XPU_XRE_BASE_VERSION "4.32.0.1") endif() if(NOT DEFINED XPU_XHPC_BASE_DATE) - set(XPU_XHPC_BASE_DATE "eb35/20240926") + set(XPU_XHPC_BASE_DATE "eb35/20240927") endif() set(XPU_XCCL_BASE_VERSION "1.2.11e") if(NOT DEFINED XPU_XFT_BASE_VERSION) From b01105c4d44853e05d871c963029b447fa780503 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sat, 28 Sep 2024 21:16:53 +0800 Subject: [PATCH 004/135] [CINN]fix flatten decomp bug and some forward only op (#68505) * fix flatten decomp bug * add forward only trait * fix bug --- paddle/fluid/primitive/composite/composite.h | 8 ++++++++ paddle/phi/ops/yaml/legacy/static_ops.yaml | 1 + paddle/phi/ops/yaml/ops.yaml | 10 +++++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index d72bbaaa53ed2..9887ea1836092 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -1125,6 +1125,14 @@ Tensor flatten_decomp(const Tensor& x, int start_axis, int end_axis) { start_axis = 0; end_axis = 0; } + if (start_axis < 0) { + start_axis += x_dim.size(); + } + + if (end_axis < 0) { + end_axis += x_dim.size(); + } + if (end_axis < start_axis) { PADDLE_THROW(common::errors::Unimplemented( "end_axis must be greater than or equal to start_axis.")); diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index fc0e89d1db7d8..00a441fffae68 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -966,6 +966,7 @@ kernel : func : unique data_type : x + traits : paddle::dialect::ForwardOnlyTrait - op : unpool args: (Tensor x, Tensor indices, int[] ksize, str unpooling_type, int[] strides = {1,1}, int[] paddings ={0,0} ,IntArray output_size = {0,0}, str data_format="NCHW") diff --git 
a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 4b4f3b7ba2623..0d9e146b400a7 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -202,6 +202,7 @@ func : allclose data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : amax args : (Tensor x, int64_t[] axis={}, bool keepdim=false) @@ -550,6 +551,7 @@ func: bincount optional: weights interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : binomial args : (Tensor count, Tensor prob) @@ -1601,6 +1603,7 @@ kernel : func : equal_all interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : erf args : (Tensor x) @@ -2433,6 +2436,7 @@ kernel : func : histogram interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : hsigmoid_loss args : (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool is_sparse) @@ -2630,6 +2634,7 @@ func : IsEmptyInferMeta kernel : func : is_empty + traits : paddle::dialect::ForwardOnlyTrait - op : isclose args : (Tensor x, Tensor y, Scalar(double) rtol=1e-5, Scalar(double) atol=1e-8, bool equal_nan=false) @@ -2641,6 +2646,7 @@ func : isclose data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : isfinite args : (Tensor x) @@ -2651,6 +2657,7 @@ func : isfinite {dense -> dense}, isfinite_sr {selected_rows -> selected_rows} interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : isinf args : (Tensor x) @@ -2826,6 +2833,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : llm_int8_linear args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, float threshold=6.0) @@ -3471,7 +3479,6 @@ output : Tensor(out) infer_meta : func : NonZeroInferMeta - spmd_rule: NonZeroInferSpmd kernel : func : nonzero data_type: condition @@ -4158,6 +4165,7 @@ func : searchsorted data_type : sorted_sequence interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : segment_pool args : (Tensor x, Tensor segment_ids, str pooltype="SUM") From 1a08c9deda8adafc5900c61975a031a412bdfda0 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Sun, 29 Sep 2024 10:06:01 +0800 Subject: [PATCH 005/135] [PIR] fix pir disable case (#68386) * fix pir disable case * refine * refine * refine * refine * refine --- test/deprecated/legacy_test/CMakeLists.txt | 6 + ...est_fuse_gemm_epilogue_pass_deprecated.py} | 0 .../test_eager_blacklist_flag_deprecated.py} | 0 .../prim/prim/vjp/static/CMakeLists.txt | 8 + .../static/test_comp_add_grad_deprecated.py} | 0 .../test_comp_add_tanh_grad_deprecated.py} | 0 .../test_comp_batch_norm_grad_deprecated.py} | 0 .../static/test_comp_cast_grad_deprecated.py} | 0 .../test_comp_cumprod_grad_deprecated.py | 213 ++++++++++++++++++ .../static/test_comp_div_grad_deprecated.py} | 0 .../prim/vjp/static/test_comp_exp_grad.py | 0 .../test_comp_expand_grad_deprecated.py} | 0 ...est_comp_matmul_double_grad_deprecated.py} | 0 .../test_comp_multiply_grad_deprecated.py} | 0 .../test_comp_reshape_grad_deprecated.py} | 0 .../test_comp_sigmoid_grad_deprecated.py} | 0 .../prim/vjp/static/test_comp_sqrt_grad.py | 0 .../static/test_comp_sub_grad_deprecated.py} | 0 
.../static/test_comp_sum_grad_deprecated.py} | 0 .../prim/vjp/static/test_comp_tanh_grad.py | 0 .../test_comp_transpose_grad_deprecated.py} | 0 test/dygraph_to_static/CMakeLists.txt | 2 - test/legacy_test/CMakeLists.txt | 3 - test/legacy_test/test_fuse_bn_add_act_pass.py | 40 ++-- .../legacy_test/test_raw_program_optimizer.py | 49 ++-- test/prim/prim/CMakeLists.txt | 1 - test/prim/prim/flags/CMakeLists.txt | 9 - test/prim/prim/vjp/CMakeLists.txt | 1 - test/prim/prim/vjp/static/CMakeLists.txt | 18 -- tools/windows/run_unittests.sh | 2 +- 30 files changed, 274 insertions(+), 78 deletions(-) rename test/{legacy_test/test_fuse_gemm_epilogue_pass.py => deprecated/legacy_test/test_fuse_gemm_epilogue_pass_deprecated.py} (100%) rename test/{prim/prim/flags/test_eager_blacklist_flag.py => deprecated/prim/prim/flags/test_eager_blacklist_flag_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_add_grad.py => deprecated/prim/prim/vjp/static/test_comp_add_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_add_tanh_grad.py => deprecated/prim/prim/vjp/static/test_comp_add_tanh_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_batch_norm_grad.py => deprecated/prim/prim/vjp/static/test_comp_batch_norm_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_cast_grad.py => deprecated/prim/prim/vjp/static/test_comp_cast_grad_deprecated.py} (100%) create mode 100644 test/deprecated/prim/prim/vjp/static/test_comp_cumprod_grad_deprecated.py rename test/{prim/prim/vjp/static/test_comp_div_grad.py => deprecated/prim/prim/vjp/static/test_comp_div_grad_deprecated.py} (100%) rename test/{ => deprecated}/prim/prim/vjp/static/test_comp_exp_grad.py (100%) rename test/{prim/prim/vjp/static/test_comp_expand_grad.py => deprecated/prim/prim/vjp/static/test_comp_expand_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_matmul_double_grad.py => deprecated/prim/prim/vjp/static/test_comp_matmul_double_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_multiply_grad.py => deprecated/prim/prim/vjp/static/test_comp_multiply_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_reshape_grad.py => deprecated/prim/prim/vjp/static/test_comp_reshape_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_sigmoid_grad.py => deprecated/prim/prim/vjp/static/test_comp_sigmoid_grad_deprecated.py} (100%) rename test/{ => deprecated}/prim/prim/vjp/static/test_comp_sqrt_grad.py (100%) rename test/{prim/prim/vjp/static/test_comp_sub_grad.py => deprecated/prim/prim/vjp/static/test_comp_sub_grad_deprecated.py} (100%) rename test/{prim/prim/vjp/static/test_comp_sum_grad.py => deprecated/prim/prim/vjp/static/test_comp_sum_grad_deprecated.py} (100%) rename test/{ => deprecated}/prim/prim/vjp/static/test_comp_tanh_grad.py (100%) rename test/{prim/prim/vjp/static/test_comp_transpose_grad.py => deprecated/prim/prim/vjp/static/test_comp_transpose_grad_deprecated.py} (100%) delete mode 100644 test/prim/prim/flags/CMakeLists.txt delete mode 100644 test/prim/prim/vjp/static/CMakeLists.txt diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt index 088ce7bb8a959..0d047a8539c11 100644 --- a/test/deprecated/legacy_test/CMakeLists.txt +++ b/test/deprecated/legacy_test/CMakeLists.txt @@ -78,6 +78,7 @@ list(REMOVE_ITEM TEST_OPS test_audio_logmel_feature test_audio_mel_feature) list(REMOVE_ITEM TEST_OPS test_fused_dot_product_attention_op) list(REMOVE_ITEM TEST_OPS 
test_fused_dot_product_attention_op_static) list(REMOVE_ITEM TEST_OPS test_fuse_dot_product_attention_pass) +list(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass_deprecated) if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) list(REMOVE_ITEM TEST_OPS test_memcpy_op) @@ -577,6 +578,11 @@ if(WITH_DISTRIBUTE) endif() endif() +if((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6)) + py_test_modules(test_fuse_gemm_epilogue_pass_deprecated MODULES + test_fuse_gemm_epilogue_pass_deprecated) +endif() + if(WIN32) py_test_modules( test_feed_data_check_shape_type_deprecated MODULES diff --git a/test/legacy_test/test_fuse_gemm_epilogue_pass.py b/test/deprecated/legacy_test/test_fuse_gemm_epilogue_pass_deprecated.py similarity index 100% rename from test/legacy_test/test_fuse_gemm_epilogue_pass.py rename to test/deprecated/legacy_test/test_fuse_gemm_epilogue_pass_deprecated.py diff --git a/test/prim/prim/flags/test_eager_blacklist_flag.py b/test/deprecated/prim/prim/flags/test_eager_blacklist_flag_deprecated.py similarity index 100% rename from test/prim/prim/flags/test_eager_blacklist_flag.py rename to test/deprecated/prim/prim/flags/test_eager_blacklist_flag_deprecated.py diff --git a/test/deprecated/prim/prim/vjp/static/CMakeLists.txt b/test/deprecated/prim/prim/vjp/static/CMakeLists.txt index 863a484c466f1..1fc0ac6320465 100644 --- a/test/deprecated/prim/prim/vjp/static/CMakeLists.txt +++ b/test/deprecated/prim/prim/vjp/static/CMakeLists.txt @@ -8,3 +8,11 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() + +set_tests_properties(test_comp_sum_grad_deprecated PROPERTIES TIMEOUT 60) +set_tests_properties(test_comp_tanh_grad PROPERTIES TIMEOUT 60) +set_tests_properties(test_comp_div_grad_deprecated PROPERTIES TIMEOUT 60) +set_tests_properties(test_comp_add_grad_deprecated PROPERTIES TIMEOUT 60) +set_tests_properties(test_comp_sub_grad_deprecated PROPERTIES TIMEOUT 60) +set_tests_properties(test_comp_add_tanh_grad_deprecated PROPERTIES TIMEOUT 60) +set_tests_properties(test_comp_sqrt_grad PROPERTIES TIMEOUT 60) diff --git a/test/prim/prim/vjp/static/test_comp_add_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_add_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_add_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_add_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_add_tanh_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_add_tanh_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_add_tanh_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_add_tanh_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_batch_norm_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_batch_norm_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_batch_norm_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_batch_norm_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_cast_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_cast_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_cast_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_cast_grad_deprecated.py diff --git a/test/deprecated/prim/prim/vjp/static/test_comp_cumprod_grad_deprecated.py b/test/deprecated/prim/prim/vjp/static/test_comp_cumprod_grad_deprecated.py new file mode 100644 
index 0000000000000..239df04b9f8b4 --- /dev/null +++ b/test/deprecated/prim/prim/vjp/static/test_comp_cumprod_grad_deprecated.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.base import core + +core._set_prim_backward_enabled(True) + +import random + +import numpy as np +import parameterized as param + +import paddle + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.fc = paddle.nn.Linear(4, 4) + + def forward(self, x): + tmp = self.fc(x) + out = paddle.cumprod(tmp, -1) + return out + + +@param.parameterized_class( + ('primal', 'cotangent', 'dtype'), + [ + ( + np.random.uniform(1, 5, (50,)), + np.random.uniform(1, 5, (50,)), + np.float32, + ), + (np.random.rand(10, 10), np.random.rand(10, 10), np.float32), + (np.random.rand(3, 4, 5), np.random.rand(3, 4, 5), np.float32), + (np.random.rand(2, 3, 4, 5), np.random.rand(2, 3, 4, 5), np.float32), + ( + np.random.rand(2, 3, 2, 4, 5), + np.random.rand(2, 3, 2, 4, 5), + np.float32, + ), + (np.random.randint(1, 20, (10, 10)), np.random.rand(10, 10), np.int64), + ], +) +class TestCumprodGradComp(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primal = cls.primal.astype(cls.dtype) + cls.cotangent = cls.cotangent.astype(cls.dtype) + cls.zero_nums = [0, 1, 10, int(np.prod(cls.primal.shape))] + + def train(self, use_prim, use_cinn): + paddle.seed(2022) + self.x = paddle.randn([2, 4]) + self.x.stop_gradient = False + net = PrimeNet() + core._set_prim_backward_enabled(use_prim) + net = apply_to_static(net, use_cinn) + out = net(self.x) + res = paddle.autograd.grad(out, [self.x]) + + return res + + def test_cumprod_grad_comp(self): + paddle.enable_static() + + def actual(primal, cotangent, dim): + core._set_prim_backward_enabled(True) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + y = paddle.cumprod(x, dim) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=[x_cotangent[0]], + )[0] + + def desired(primal, cotangent, dim): + core._set_prim_backward_enabled(False) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + y = paddle.cumprod(x, dim) + x_cotangent = paddle.static.gradients(y, x, v) + exe = 
paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=[x_cotangent[0]], + )[0] + + for zero_num in self.zero_nums: + shape = self.primal.shape + x = self.primal.flatten() + indices = random.sample(range(x.size), zero_num) + for i in indices: + x[i] = 0 + x = np.reshape(x, shape) + for i in range(len(self.primal.shape)): + np.testing.assert_allclose( + actual=actual(x, self.cotangent, i), + desired=desired(x, self.cotangent, i), + rtol=1e-6, + atol=0, + ) + core._set_prim_backward_enabled(False) + paddle.disable_static() + + +@param.parameterized_class( + ('primal', 'cotangent', 'dtype'), + [ + ( + np.random.uniform(1, 5, ()), + np.random.uniform(1, 5, ()), + np.float32, + ) + ], +) +class TestCumprodGradComp0D(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.primal = cls.primal.astype(cls.dtype) + cls.cotangent = cls.cotangent.astype(cls.dtype) + + def test_cumprod_grad_comp_0d(self): + paddle.enable_static() + + def actual(primal, cotangent, dim): + core._set_prim_backward_enabled(True) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + y = paddle.cumprod(x, dim) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=[x_cotangent[0]], + )[0] + + def desired(primal, cotangent, dim): + core._set_prim_backward_enabled(False) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x = paddle.static.data('primal', primal.shape, primal.dtype) + x.stop_gradient = False + v = paddle.static.data( + 'cotangent', cotangent.shape, cotangent.dtype + ) + y = paddle.cumprod(x, dim) + x_cotangent = paddle.static.gradients(y, x, v) + exe = paddle.static.Executor() + exe.run(sp) + return exe.run( + program=mp, + feed={'primal': primal, 'cotangent': cotangent}, + fetch_list=[x_cotangent[0]], + )[0] + + np.testing.assert_allclose( + actual=actual(self.primal, self.cotangent, 0), + desired=desired(self.primal, self.cotangent, 0), + rtol=1e-6, + atol=0, + ) + core._set_prim_backward_enabled(False) + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/test/prim/prim/vjp/static/test_comp_div_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_div_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_div_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_div_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_exp_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_exp_grad.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_exp_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_exp_grad.py diff --git a/test/prim/prim/vjp/static/test_comp_expand_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_expand_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_expand_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_expand_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_matmul_double_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_matmul_double_grad_deprecated.py similarity index 100% rename from 
test/prim/prim/vjp/static/test_comp_matmul_double_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_matmul_double_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_multiply_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_multiply_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_multiply_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_multiply_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_reshape_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_reshape_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_reshape_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_reshape_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_sigmoid_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_sigmoid_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_sigmoid_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_sigmoid_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_sqrt_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_sqrt_grad.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_sqrt_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_sqrt_grad.py diff --git a/test/prim/prim/vjp/static/test_comp_sub_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_sub_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_sub_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_sub_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_sum_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_sum_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_sum_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_sum_grad_deprecated.py diff --git a/test/prim/prim/vjp/static/test_comp_tanh_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_tanh_grad.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_tanh_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_tanh_grad.py diff --git a/test/prim/prim/vjp/static/test_comp_transpose_grad.py b/test/deprecated/prim/prim/vjp/static/test_comp_transpose_grad_deprecated.py similarity index 100% rename from test/prim/prim/vjp/static/test_comp_transpose_grad.py rename to test/deprecated/prim/prim/vjp/static/test_comp_transpose_grad_deprecated.py diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index 8c1ff123f03ec..5e6a1d2c4550b 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -12,8 +12,6 @@ if(WIN32 AND NOT WITH_GPU) endif() list(REMOVE_ITEM TEST_OPS test_build_strategy) -list(REMOVE_ITEM TEST_OPS test_simnet) -list(REMOVE_ITEM TEST_OPS test_pylayer) if(NOT WITH_GPU) # TODO(SigureMo): Temporarily disable train step on Windows CPU CI. 
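The prim vjp tests relocated into test/deprecated above (including the new cumprod test) all share one structure: run paddle.static.gradients on the same graph twice, once with core._set_prim_backward_enabled(True) so the backward op is decomposed into primitive ops, and once with the native backward kernel, then assert that the two gradients agree numerically. A condensed sketch of that pattern, with tanh standing in for the op under test (the helper name is ours):

    import numpy as np
    import paddle
    from paddle.base import core

    def tanh_grad(x_np, use_prim):
        core._set_prim_backward_enabled(use_prim)  # toggle backward decomposition
        paddle.enable_static()
        main, startup = paddle.static.Program(), paddle.static.Program()
        with paddle.static.program_guard(main, startup):
            x = paddle.static.data('x', x_np.shape, x_np.dtype)
            x.stop_gradient = False
            y = paddle.tanh(x)
            x_grad = paddle.static.gradients(y, x)[0]
        exe = paddle.static.Executor()
        exe.run(startup)
        out = exe.run(main, feed={'x': x_np}, fetch_list=[x_grad])[0]
        paddle.disable_static()
        core._set_prim_backward_enabled(False)
        return out

    x = np.random.rand(4, 4).astype('float32')
    np.testing.assert_allclose(tanh_grad(x, True), tanh_grad(x, False), rtol=1e-6)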
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 627646a43d56a..e5d3356a44528 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -31,7 +31,6 @@ list(REMOVE_ITEM TEST_OPS test_pool_max_op) list(REMOVE_ITEM TEST_OPS test_matmul_v2_op) list(REMOVE_ITEM TEST_OPS test_allgather) -list(REMOVE_ITEM TEST_OPS test_raw_program_optimizer) list(REMOVE_ITEM TEST_OPS test_reducescatter) list(REMOVE_ITEM TEST_OPS test_flash_attention) @@ -89,7 +88,6 @@ endif() list(REMOVE_ITEM TEST_OPS test_audio_logmel_feature test_audio_mel_feature) list(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) list(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op) -list(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) list(REMOVE_ITEM TEST_OPS test_fused_dot_product_attention_op) list(REMOVE_ITEM TEST_OPS test_fused_dot_product_attention_op_static) list(REMOVE_ITEM TEST_OPS test_fuse_dot_product_attention_pass) @@ -445,7 +443,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_lstm_api) list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) -list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass) list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) list(REMOVE_ITEM TEST_OPS test_layers) list(REMOVE_ITEM TEST_OPS test_install_check) diff --git a/test/legacy_test/test_fuse_bn_add_act_pass.py b/test/legacy_test/test_fuse_bn_add_act_pass.py index d121a211e7afd..b71ba7206ebca 100644 --- a/test/legacy_test/test_fuse_bn_add_act_pass.py +++ b/test/legacy_test/test_fuse_bn_add_act_pass.py @@ -241,27 +241,29 @@ def check(self, place, use_cuda): self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5) def test_fuse_bn_add_act(self): - place = base.CUDAPlace(0) - self.check(place, use_cuda=True) + with paddle.pir_utils.OldIrGuard(): + place = base.CUDAPlace(0) + self.check(place, use_cuda=True) def test_fuse_bn_add_act_API(self): - # build_fused_program: use fused_bn_add_act python API - main_program = base.Program() - startup_program = base.Program() - place = base.CUDAPlace(0) - x, y, loss = self.build_fused_program( - main_program, startup_program, use_cuda=True - ) - exe = base.Executor(place) - scope = base.Scope() - with base.scope_guard(scope): - exe.run(startup_program) - for _ in range(5): - x = np.random.random((4, 1, 28, 28)).astype("float32") - y = np.random.random((4, 1)).astype("int64") - loss_v = exe.run( - main_program, feed={"x": x, "y": y}, fetch_list=[loss] - ) + with paddle.pir_utils.OldIrGuard(): + # build_fused_program: use fused_bn_add_act python API + main_program = base.Program() + startup_program = base.Program() + place = base.CUDAPlace(0) + x, y, loss = self.build_fused_program( + main_program, startup_program, use_cuda=True + ) + exe = base.Executor(place) + scope = base.Scope() + with base.scope_guard(scope): + exe.run(startup_program) + for _ in range(5): + x = np.random.random((4, 1, 28, 28)).astype("float32") + y = np.random.random((4, 1)).astype("int64") + loss_v = exe.run( + main_program, feed={"x": x, "y": y}, fetch_list=[loss] + ) if __name__ == '__main__': diff --git a/test/legacy_test/test_raw_program_optimizer.py b/test/legacy_test/test_raw_program_optimizer.py index 362970cdf2d3f..d8647e49c1442 100644 --- a/test/legacy_test/test_raw_program_optimizer.py +++ b/test/legacy_test/test_raw_program_optimizer.py @@ -47,31 +47,32 @@ def gen_data(self): def test_single_gpu(self): paddle.enable_static() - fleet.init(is_collective=True) - 
sharding_program = paddle.static.Program() - sharding_startup_program = paddle.static.Program() - strategy = fleet.DistributedStrategy() - strategy.without_graph_optimization = True - with base.program_guard(sharding_program, sharding_startup_program): - with base.unique_name.guard(): - input_x = paddle.static.data( - name="x", shape=[None, 32], dtype='float32' - ) - input_y = paddle.static.data( - name="y", shape=[None, 1], dtype='int64' - ) - cost = self.mlp(input_x=input_x, input_y=input_y) - output_name = cost.name - optimizer = fleet.distributed_optimizer( - paddle.optimizer.Adam(), strategy - ) - optimizer.minimize(cost) + with paddle.pir_utils.OldIrGuard(): + fleet.init(is_collective=True) + sharding_program = paddle.static.Program() + sharding_startup_program = paddle.static.Program() + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + with base.program_guard(sharding_program, sharding_startup_program): + with base.unique_name.guard(): + input_x = paddle.static.data( + name="x", shape=[None, 32], dtype='float32' + ) + input_y = paddle.static.data( + name="y", shape=[None, 1], dtype='int64' + ) + cost = self.mlp(input_x=input_x, input_y=input_y) + output_name = cost.name + optimizer = fleet.distributed_optimizer( + paddle.optimizer.Adam(), strategy + ) + optimizer.minimize(cost) - trainer_id = fleet.worker_index() - exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id)) - rank = fleet.worker_index() - exe.run(sharding_startup_program) - exe.run(program=sharding_program, feed=self.gen_data()) + trainer_id = fleet.worker_index() + exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id)) + rank = fleet.worker_index() + exe.run(sharding_startup_program) + exe.run(program=sharding_program, feed=self.gen_data()) if __name__ == "__main__": diff --git a/test/prim/prim/CMakeLists.txt b/test/prim/prim/CMakeLists.txt index 80c5c8fe1538f..db4822bce3f91 100644 --- a/test/prim/prim/CMakeLists.txt +++ b/test/prim/prim/CMakeLists.txt @@ -9,4 +9,3 @@ foreach(TEST_OP ${TEST_OPS}) endforeach() add_subdirectory(vjp) -add_subdirectory(flags) diff --git a/test/prim/prim/flags/CMakeLists.txt b/test/prim/prim/flags/CMakeLists.txt deleted file mode 100644 index 72c6bbd7d05e8..0000000000000 --- a/test/prim/prim/flags/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -file( - GLOB TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) -endforeach() diff --git a/test/prim/prim/vjp/CMakeLists.txt b/test/prim/prim/vjp/CMakeLists.txt index 84084b0b4d956..df40c0f39e976 100644 --- a/test/prim/prim/vjp/CMakeLists.txt +++ b/test/prim/prim/vjp/CMakeLists.txt @@ -11,4 +11,3 @@ endforeach() set_tests_properties(test_comp_high_grad PROPERTIES TIMEOUT 100) add_subdirectory(eager) -add_subdirectory(static) diff --git a/test/prim/prim/vjp/static/CMakeLists.txt b/test/prim/prim/vjp/static/CMakeLists.txt deleted file mode 100644 index a29e094a17f05..0000000000000 --- a/test/prim/prim/vjp/static/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -file( - GLOB TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) - -foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) -endforeach() - -set_tests_properties(test_comp_sum_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_tanh_grad PROPERTIES 
TIMEOUT 60) -set_tests_properties(test_comp_div_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_add_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_sub_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_add_tanh_grad PROPERTIES TIMEOUT 60) -set_tests_properties(test_comp_sqrt_grad PROPERTIES TIMEOUT 60) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 55d3ec25d26c5..8c46b0e49dc38 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -62,7 +62,7 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\ ^test_mul_op$|\ ^test_bmn$|\ ^test_memory_efficient_attention$|\ -^test_fuse_gemm_epilogue_pass$|\ +^test_fuse_gemm_epilogue_pass_deprecated$|\ ^test_tril_triu_op$|\ ^test_train_step_resnet18_adam$|\ ^test_train_step_resnet18_sgd$|\ From a98d2bfea652c7d5dfc3b96d356cd44f94e5c421 Mon Sep 17 00:00:00 2001 From: co63oc Date: Sun, 29 Sep 2024 10:11:19 +0800 Subject: [PATCH 006/135] Add sync_comm_stream double type [fluid_ops] (#68496) --- paddle/phi/kernels/gpu/sync_comm_stream_kernel.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/sync_comm_stream_kernel.cu b/paddle/phi/kernels/gpu/sync_comm_stream_kernel.cu index 5a8fdb9dde1db..b0ff9f55de34b 100644 --- a/paddle/phi/kernels/gpu/sync_comm_stream_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_comm_stream_kernel.cu @@ -14,5 +14,9 @@ #include "paddle/phi/kernels/impl/sync_comm_stream_kernel_impl.h" -PD_REGISTER_KERNEL( - sync_comm_stream, GPU, ALL_LAYOUT, phi::SyncCommStreamKernel, float) {} +PD_REGISTER_KERNEL(sync_comm_stream, + GPU, + ALL_LAYOUT, + phi::SyncCommStreamKernel, + float, + double) {} From 0056291512c485e84b668c56a910a599daac3b3f Mon Sep 17 00:00:00 2001 From: Whsjrczr <123729598+Whsjrczr@users.noreply.github.com> Date: Sun, 29 Sep 2024 10:51:44 +0800 Subject: [PATCH 007/135] =?UTF-8?q?=E3=80=90Infer=20Symbolic=20Shape=20No.?= =?UTF-8?q?125=E3=80=91=E3=80=90BUAA=E3=80=91Add=20BroadcastTensor,=20chan?= =?UTF-8?q?ged=203=20files=20(#68180)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add broadcast tensor * rerun * size_t -> int * add shape * delete size_t * size_t --- .../multiary_infer_sym.cc | 41 ++++++++++++++++--- .../infer_symbolic_shape/multiary_infer_sym.h | 2 +- paddle/phi/ops/yaml/ops.yaml | 1 + 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 273e22db6f1de..5a254dd84b2a0 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -652,12 +652,41 @@ bool BilinearOpInferSymbolicShape( // return true; // } -// bool BroadcastTensorsOpInferSymbolicShape(pir::Operation *op, -// pir::InferSymbolicShapeContext -// *infer_context) { -// // pass -// return true; -// } +bool BroadcastTensorsOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const auto &input_shape_or_data_list = + infer_context->GetShapeOrDataForValue(op->operand_source(0)) + .dyn_cast(); + // 1. 
Find Output rank = max(Inputs rank) + int target_rank = 0; + for (const auto &input_shape_or_data : input_shape_or_data_list) { + int tmp_rank = input_shape_or_data.shape().size(); + target_rank = std::max(target_rank, tmp_rank); + } + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + std::vector out_shape; + symbol::DimExprBuilder builder; + for (int i = 0; i < target_rank; i++) { + auto tmp_dim = symbol::DimExpr{1}; + for (const auto &input_shape_or_data : input_shape_or_data_list) { + int axis = input_shape_or_data.shape().size(); + axis = i - target_rank + axis; + if (axis >= 0) { + infer_context->AddBroadcastableCstr(input_shape_or_data.shape()[axis], + tmp_dim); + tmp_dim = builder.Broadcast(input_shape_or_data.shape()[axis], tmp_dim); + } + } + out_shape.emplace_back(tmp_dim); + } + symbol::TensorListShapeOrDataDimExprs out_shapes; + for (size_t i = 0; i < input_shape_or_data_list.size(); i++) { + out_shapes.emplace_back(out_shape); + } + infer_context->SetShapeOrDataForValue( + op->result(0), symbol::ShapeOrDataDimExprs{out_shapes}); + return true; +} bool BilinearInterpOpInferSymbolicShape( pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index 1a778a9424163..a67cf5cbf1551 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -24,7 +24,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Addmm_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(AddN) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Auc) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(AssignPos) -// OP_DECLARE_INFER_SYMBOLIC_SHAPE(BroadcastTensors) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(BroadcastTensors) OP_DECLARE_INFER_SYMBOLIC_SHAPE(BatchFc) OP_DECLARE_INFER_SYMBOLIC_SHAPE(BatchNorm) OP_DECLARE_INFER_SYMBOLIC_SHAPE(BatchNorm_) diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 0d9e146b400a7..1c45c5009e67c 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -684,6 +684,7 @@ func: broadcast_tensors data_type : input backward: broadcast_tensors_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : c_allgather args : (Tensor x, int ring_id, int nranks, bool use_calc_stream) From 26d2d99b6ba196706522f5c3adba8adb8371eb13 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Sun, 29 Sep 2024 11:12:47 +0800 Subject: [PATCH 008/135] [Prim] Mark `reduce_as` no need `target` buffer and use `by_pass` in both dynamic and static shape branch (#68491) --- paddle/common/ddim.cc | 12 +++++++++ paddle/common/ddim.h | 3 +++ paddle/fluid/primitive/rule/vjp/details.h | 30 +++++++++++------------ paddle/phi/ops/yaml/ops.yaml | 1 + 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/paddle/common/ddim.cc b/paddle/common/ddim.cc index efba5250a268f..4ef89f9f396c5 100644 --- a/paddle/common/ddim.cc +++ b/paddle/common/ddim.cc @@ -282,6 +282,18 @@ DDim ComputeCompatibleDim(const DDim& dim1, const DDim& dim2) { return make_ddim(result); } +bool AreDimsWithDynamicShapeCompatible(const DDim& dim1, const DDim& dim2) { + if (dim1.size() != dim2.size()) { + return false; + } + for (int i = 0; i < dim1.size(); ++i) { + if (dim1[i] >= 0 && dim2[i] >= 0 && dim1[i] != dim2[i]) { + return false; + } + } + return true; +} + } // namespace common namespace std { diff --git 
a/paddle/common/ddim.h b/paddle/common/ddim.h index 88268f6d8f056..831635f36a7b9 100644 --- a/paddle/common/ddim.h +++ b/paddle/common/ddim.h @@ -234,6 +234,9 @@ TEST_API DDim stride(const DDim& ddim); TEST_API DDim stride_numel(const DDim& ddim); +TEST_API bool AreDimsWithDynamicShapeCompatible(const DDim& dim1, + const DDim& dim2); + TEST_API DDim ComputeCompatibleDim(const DDim& dim1, const DDim& dim2); } // namespace common diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 69e5a898c596f..e54371fbac5f3 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -20,6 +20,7 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" #include "paddle/fluid/primitive/utils/utils.h" @@ -655,37 +656,36 @@ void add_grad(const Tensor& x, Tensor* dx, Tensor* dy) { if (dy) { - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { - auto dy_tmp = reduce_as(out_grad, y); - set_output(dy_tmp, dy); - } else { - if (out_grad.dims() != y.dims()) { + if (!common::AreDimsWithDynamicShapeCompatible(out_grad.dims(), y.dims())) { + if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + auto dy_tmp = reduce_as(out_grad, y); + set_output(dy_tmp, dy); + } else { phi::DDim reduce_dim = get_reduce_dims_from_out(out_grad.dims(), y.dims()); auto dy_reduce_res = out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); - - } else { - by_pass(out_grad, dy); } + } else { + by_pass(out_grad, dy); } } if (dx) { - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { - auto dx_tmp = reduce_as(out_grad, x); - set_output(dx_tmp, dx); - } else { - if (out_grad.dims() != x.dims()) { + if (!common::AreDimsWithDynamicShapeCompatible(out_grad.dims(), x.dims())) { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + auto dx_tmp = reduce_as(out_grad, x); + set_output(dx_tmp, dx); + } else { auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); auto dx_reduce_res = out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); - } else { - by_pass(out_grad, dx); } + } else { + by_pass(out_grad, dx); } } } diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 1c45c5009e67c..7b2239715bc11 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -3883,6 +3883,7 @@ func : reduce_as data_type : x backward : reduce_as_grad + no_need_buffer : target interfaces : paddle::dialect::InferSymbolicShapeInterface - op : reduce_scatter From fadf686795e50da13b97c4cc0b2f7556c964403b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BF=83=E5=AE=B8?= <118902573+tlxd@users.noreply.github.com> Date: Sun, 29 Sep 2024 14:19:07 +0800 Subject: [PATCH 009/135] [CINN] add `triangular_solve` op in `binary_infer_sym.cc` (#68022) * Update ops.yaml binary_infer_sym.cc binary_infer_sym.h * Update binary_infer_sym.cc --- .../infer_symbolic_shape/binary_infer_sym.cc | 70 +++++++++++++++++-- .../infer_symbolic_shape/binary_infer_sym.h | 2 +- paddle/phi/ops/yaml/ops.yaml | 2 +- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git 
a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc
index c1339db429384..24ea5653a23e1 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc
@@ -1781,12 +1781,70 @@ bool TdmChildOpInferSymbolicShape(
   return true;
 }
 
-// bool TriangularSolveOpInferSymbolicShape(pir::Operation *op,
-//                                          pir::InferSymbolicShapeContext
-//                                          *infer_context) {
-//   // pass
-//   return true;
-// }
+static inline std::vector<symbol::DimExpr> MatrixGetBroadcastBatchPortion(
+    const std::vector<symbol::DimExpr> &x,
+    const std::vector<symbol::DimExpr> &y,
+    pir::InferSymbolicShapeContext *infer_context) {
+  // use int to avoid underflow when subtracting the two sizes
+  int size_x = x.size();
+  int size_y = y.size();
+  int max_size = std::max(size_x, size_y);
+  std::vector<symbol::DimExpr> batchPortion(max_size);
+
+  int size_diff = size_x - size_y;
+  if (size_diff > 0) {
+    for (int i = 0; i < size_diff; i++) {
+      batchPortion[i] = x[i];
+    }
+  } else {
+    size_diff = -size_diff;
+    for (int i = 0; i < size_diff; i++) {
+      batchPortion[i] = y[i];
+    }
+  }
+
+  symbol::DimExprBuilder builder;
+  for (int i = size_diff; i < max_size; i++) {
+    int offset = max_size - i;
+    int dim_x = size_x - offset;
+    int dim_y = size_y - offset;
+    infer_context->AddBroadcastableCstr(x[dim_x], y[dim_y]);
+    batchPortion[i] = builder.Broadcast(x[dim_x], y[dim_y]);
+  }
+  return batchPortion;
+}
+
+bool TriangularSolveOpInferSymbolicShape(
+    pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) {
+  const auto &x_shape_or_data =
+      infer_context->GetShapeOrDataForValue(op->operand_source(0));
+  const std::vector<symbol::DimExpr> &x_shape = x_shape_or_data.shape();
+  const auto &y_shape_or_data =
+      infer_context->GetShapeOrDataForValue(op->operand_source(1));
+  const std::vector<symbol::DimExpr> &y_shape = y_shape_or_data.shape();
+
+  const auto &x_rank = x_shape.size();
+  const auto &y_rank = y_shape.size();
+
+  infer_context->AddEqualCstr(x_shape[x_rank - 2], x_shape[x_rank - 1]);
+
+  std::vector<symbol::DimExpr> x_shape_cut(x_shape.begin(), x_shape.end() - 2);
+  std::vector<symbol::DimExpr> y_shape_cut(y_shape.begin(), y_shape.end() - 2);
+
+  std::vector<symbol::DimExpr> expand_batch_portion =
+      MatrixGetBroadcastBatchPortion(x_shape_cut, y_shape_cut, infer_context);
+
+  std::vector<symbol::DimExpr> output_shape({expand_batch_portion});
+  output_shape.insert(output_shape.end(),
+                      {y_shape[y_rank - 2], y_shape[y_rank - 1]});
+
+  infer_context->SetShapeOrDataForValue(
+      op->result(0),
+      symbol::ShapeOrDataDimExprs{
+          symbol::TensorShapeOrDataDimExprs(output_shape)});
+
+  return true;
+}
 
 bool Unpool3dOpInferSymbolicShape(
     pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) {
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h
index 3401ff9db122c..9ab5332b09409 100755
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h
@@ -91,7 +91,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Swiglu)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(TakeAlongAxis)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(TopPSampling)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(TdmChild)
-// OP_DECLARE_INFER_SYMBOLIC_SHAPE(TriangularSolve)
+OP_DECLARE_INFER_SYMBOLIC_SHAPE(TriangularSolve)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Unpool3d)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Unpool)
 // OP_DECLARE_INFER_SYMBOLIC_SHAPE(WeightDequantize)
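For intuition, MatrixGetBroadcastBatchPortion above applies ordinary NumPy-style broadcasting to the batch dimensions (everything except the trailing two matrix dims), registering a broadcastable constraint for each aligned pair. The same rule over concrete integer dims, with the symbolic DimExprs replaced by plain ints for illustration:

    def broadcast_batch_portion(x, y):
        # x, y: batch-dim lists, e.g. from [*batch, m, m] and [*batch, m, k].
        size_x, size_y = len(x), len(y)
        m = min(size_x, size_y)
        # Unmatched leading dims come from the longer list as-is
        # (one of these two slices is always empty).
        out = list(x[: size_x - m]) + list(y[: size_y - m])
        for dx, dy in zip(x[size_x - m :], y[size_y - m :]):
            if dx != dy and dx != 1 and dy != 1:
                raise ValueError(f"dims {dx} and {dy} are not broadcastable")
            out.append(max(dx, dy))  # plays the role of builder.Broadcast
        return out

    assert broadcast_batch_portion([2, 3, 1, 4], [3, 5, 4]) == [2, 3, 5, 4]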
diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml
index 7b2239715bc11..916736670c39b 100755
--- a/paddle/phi/ops/yaml/ops.yaml
+++ b/paddle/phi/ops/yaml/ops.yaml
@@ -4854,7 +4854,7 @@
     func : triangular_solve
     data_type : x
   backward : triangular_solve_grad
-  # interfaces : paddle::dialect::InferSymbolicShapeInterface
+  interfaces : paddle::dialect::InferSymbolicShapeInterface
 
 - op : tril
   args : (Tensor x, int diagonal)

From 591d021d499186b24c34137eee5a2886a6ab71dd Mon Sep 17 00:00:00 2001
From: JZ-LIANG
Date: Sun, 29 Sep 2024 14:26:17 +0800
Subject: [PATCH 010/135] [AutoParallel] Revise GlobalToSub Reshard Rule
 (#68468)

* update unitest
* align param_grad order
* bugfix for optimizer cases
* unitest
* bugfix
* fixed unitest
* fixed
* update engine
* pir unshard tensor
* trigger CI
* remove print
* bugfix
* bugfix
* update unitest cmake
* update unitest cmake
* update reshard func
* enable unitest

---------

Co-authored-by: winter-wang <1030748926@qq.com>
---
 .../reshard_funcs/global_to_sub_mesh_func.py  | 34 +++++++++++++++++--
 ...test_semi_auto_parallel_llama_model_amp.py |  3 --
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/global_to_sub_mesh_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/global_to_sub_mesh_func.py
index 764df077af62e..3d9601ed97f8f 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/global_to_sub_mesh_func.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/global_to_sub_mesh_func.py
@@ -14,12 +14,19 @@
 
 import paddle
 
-from .base_reshard_func import ReshardFunction
+from .base_reshard_func import (
+    ReshardFunction,
+    is_replicated,
+)
+from .nd_mesh_reshard_func import NdMeshReshardFunction
 
 
 class GlobaleToSubMeshFunction(ReshardFunction):
     def is_suitable(self, src_dist_attr, dst_dist_attr):
-        if 0 in src_dist_attr.dims_mapping or 0 in src_dist_attr.partial_status:
+
+        # NOTE: we could allow a non-replicated src_dist_attr and reshard it to replicated before going through the global_to_sub logic,
+        # but dst_dist_attr must be replicated; otherwise the result is undefined when the mesh changes.
+        if not is_replicated(dst_dist_attr):
             return False
         in_mesh = src_dist_attr.process_mesh
         out_mesh = dst_dist_attr.process_mesh
@@ -32,6 +39,29 @@ def is_suitable(self, src_dist_attr, dst_dist_attr):
         return out_mesh in sub_meshes
 
     def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
+
+        # Reshard the operand to replicated before changing the mesh.
+ if not is_replicated(src_dist_attr): + tmp_dist_attr = ( + paddle.base.libpaddle.pir.create_tensor_dist_attribute( + src_dist_attr.process_mesh, + [-1] * len(src_dist_attr.dims_mapping), + {}, + ) + ) + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dist_attr + ) + + pre_reshard_func = NdMeshReshardFunction() + src_value = pre_reshard_func.reshard( + src_dist_attr, + tmp_dist_attr, + src_value, + tmp_dst_type, + ) + src_dist_attr = tmp_dist_attr + if src_value.has_one_use(): src_value.update_dist_attr(dst_dist_attr) prev_op = src_value.get_defining_op() diff --git a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model_amp.py b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model_amp.py index d0405857902a3..3f06b82cbc6db 100644 --- a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model_amp.py +++ b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model_amp.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import unittest import collective.test_communication_api_base as test_base -os.environ['FLAGS_enable_pir_api'] = '0' - class TestSemiAutoParallelLlama3DAMPTest(test_base.CommunicationTestDistBase): def setUp(self): From ad2dcab799e119060499dc1caa329a31641ad594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B5=B7=E6=B6=9B?= <76113733+successfulbarrier@users.noreply.github.com> Date: Sun, 29 Sep 2024 14:50:24 +0800 Subject: [PATCH 011/135] add illustration for chunk API (#66503) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * 添加chunk图例 * 删除测试 * pre-commit完善 * Update python/paddle/tensor/manipulation.py --------- Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> --- python/paddle/tensor/manipulation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 2b790383b3696..ed6390727b8a0 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -4400,6 +4400,19 @@ def chunk( """ Split the input tensor into multiple sub-Tensors. + Here are some examples to explain it. + + - 1. Given a 3-D tensor x with a shape [3, 3, 3], if we split the first dimension into three equal parts, it will output a list containing three 3-D tensors with a shape of [1, 3, 3]. + - 2. Given a 3-D tensor x with a shape [3, 3, 3], if we split the second dimension into three equal parts, it will output a list containing three 3-D tensors with a shape of [3, 1, 3]. + - 3. Given a 3-D tensor x with a shape [3, 3, 3], if we split the third dimension into three equal parts, it will output a list containing three 3-D tensors with a shape of [3, 3, 1]. + + The following figure illustrates the first example. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/images/chunk.png + :width: 800 + :alt: legend of reshape API + :align: center + Args: x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64. chunks(int): The number of tensor to be split along the certain axis. 
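A minimal usage sketch of the first example above (using the public
paddle.chunk API this docstring documents):

    import paddle

    x = paddle.rand([3, 3, 3])
    parts = paddle.chunk(x, chunks=3, axis=0)
    print([p.shape for p in parts])  # [[1, 3, 3], [1, 3, 3], [1, 3, 3]]

Splitting along axis=1 or axis=2 instead yields shapes [3, 1, 3] and
[3, 3, 1], matching the second and third examples.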
From 7fb1eacb0b1751aa04c525c01350009b87cc8c2c Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Sun, 29 Sep 2024 14:52:29 +0800 Subject: [PATCH 012/135] [Typing] change typing in paddle/audio/functional/window.py (#68486) --- python/paddle/audio/functional/window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py index 5ea552fc5e4c6..22197ec192b44 100644 --- a/python/paddle/audio/functional/window.py +++ b/python/paddle/audio/functional/window.py @@ -108,7 +108,7 @@ def _general_gaussian( @window_function_register.register() def _general_cosine( - M: int, a: float, sym: bool = True, dtype: str = 'float64' + M: int, a: list[float], sym: bool = True, dtype: str = 'float64' ) -> Tensor: """Compute a generic weighted sum of cosine terms window. This function is consistent with scipy.signal.windows.general_cosine(). From 0fb6f12f7a151e0a5e2e58678f1bbd189d71376a Mon Sep 17 00:00:00 2001 From: Jinyan Chen Date: Sun, 29 Sep 2024 19:20:23 +0800 Subject: [PATCH 013/135] support autocast bf16 for intel_hpu (#68445) * support autocast bf16 for intel_hpu * code style --- python/paddle/amp/auto_cast.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 7080d329f6190..e222f5fec2f69 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -232,7 +232,10 @@ def _is_custom_device_bfloat16_supported() -> bool: Judge whether current custom device support bfloat16 amp. """ place = _current_expected_place() - return place.get_device_type() == 'npu' + return ( + place.get_device_type() == 'npu' + or place.get_device_type() == 'intel_hpu' + ) def need_keep_fp32(layer: Layer, dtype: str) -> bool: From 586f15c5c921d9129c25aec64c432a1d1106b00e Mon Sep 17 00:00:00 2001 From: Nana <49900969+NKNaN@users.noreply.github.com> Date: Sun, 29 Sep 2024 19:58:34 +0800 Subject: [PATCH 014/135] =?UTF-8?q?=E3=80=90Hackathon=207th=20No.18?= =?UTF-8?q?=E3=80=91=E4=B8=BA=E7=A8=80=E7=96=8F=E8=AE=A1=E7=AE=97=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E5=A4=8D=E6=95=B0=E6=94=AF=E6=8C=812=20-part(#68432)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sparse/cpu/elementwise_grad_kernel.cc | 161 +++++++++++++++--- .../kernels/sparse/cpu/elementwise_kernel.cc | 16 +- python/paddle/sparse/binary.py | 16 +- .../legacy_test/test_sparse_elementwise_op.py | 65 +++++-- 4 files changed, 210 insertions(+), 48 deletions(-) diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc index 32e4b28ba9324..bc9b67073bdc8 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -160,6 +161,32 @@ void CopyCsrValues(const Context& dev_ctx, } } +template +void ConjugateCsrValues(const Context& dev_ctx, + const SparseCsrTensor& x, + SparseCsrTensor* x_conj) { + AllocCsrPtr(dev_ctx, x, x_conj); + CopyCsrValues(dev_ctx, x, x, x_conj); + DenseTensor x_conj_values = x_conj->values(); + x_conj_values = phi::Conj(dev_ctx, x_conj_values); + DenseTensor x_conj_crows = x_conj->crows(); + DenseTensor x_conj_cols = x_conj->cols(); + x_conj->SetMember(x_conj_crows, x_conj_cols, x_conj_values, x_conj->dims()); +} + +template +void ConjugateCooValues(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* x_conj) { + AllocCooPtr(dev_ctx, x, x_conj); + CopyCooValues(dev_ctx, x, x, x_conj); + DenseTensor x_conj_values = x_conj->values(); + x_conj_values = phi::Conj(dev_ctx, x_conj_values); + DenseTensor x_conj_indices = x_conj->indices(); + x_conj->SetMember( + x_conj_indices, x_conj_values, x_conj->dims(), x_conj->coalesced()); +} + template void ElementWiseAddCsrGradCPUKernel(const Context& dev_ctx, const SparseCsrTensor& x, @@ -212,20 +239,40 @@ void ElementWiseMultiplyCsrGradCPUKernel(const Context& dev_ctx, SparseCsrTensor* dx, SparseCsrTensor* dy) { if (dx) { - // dout*y AllocCsrPtr(dev_ctx, x, dx); SparseCsrTensor tmp_dx; AllocCsrPtr(dev_ctx, x, &tmp_dx); - sparse::ElementWiseMultiplyCsrKernel(dev_ctx, dout, y, &tmp_dx); + if (std::is_same>::value || + std::is_same>::value) { + // dout*y_conj + SparseCsrTensor y_conj; + ConjugateCsrValues(dev_ctx, y, &y_conj); + sparse::ElementWiseMultiplyCsrKernel( + dev_ctx, dout, y_conj, &tmp_dx); + } else { + // dout*y + sparse::ElementWiseMultiplyCsrKernel( + dev_ctx, dout, y, &tmp_dx); + } CopyCsrValues(dev_ctx, tmp_dx, x, dx); } if (dy) { - // dout*x AllocCsrPtr(dev_ctx, y, dy); SparseCsrTensor tmp_dy; AllocCsrPtr(dev_ctx, y, &tmp_dy); - sparse::ElementWiseMultiplyCsrKernel(dev_ctx, dout, x, &tmp_dy); + if (std::is_same>::value || + std::is_same>::value) { + // dout*x_conj + SparseCsrTensor x_conj; + ConjugateCsrValues(dev_ctx, x, &x_conj); + sparse::ElementWiseMultiplyCsrKernel( + dev_ctx, dout, x_conj, &tmp_dy); + } else { + // dout*x + sparse::ElementWiseMultiplyCsrKernel( + dev_ctx, dout, x, &tmp_dy); + } CopyCsrValues(dev_ctx, tmp_dy, y, dy); } } @@ -239,11 +286,20 @@ void ElementWiseDivideCsrGradCPUKernel(const Context& dev_ctx, SparseCsrTensor* dx, SparseCsrTensor* dy) { if (dx) { - // dout/y AllocCsrPtr(dev_ctx, x, dx); SparseCsrTensor tmp_dx; AllocCsrPtr(dev_ctx, x, &tmp_dx); - sparse::ElementWiseDivideCsrKernel(dev_ctx, dout, y, &tmp_dx); + if (std::is_same>::value || + std::is_same>::value) { + // dout/y_conj + SparseCsrTensor y_conj; + ConjugateCsrValues(dev_ctx, y, &y_conj); + sparse::ElementWiseDivideCsrKernel( + dev_ctx, dout, y_conj, &tmp_dx); + } else { + // dout/y + sparse::ElementWiseDivideCsrKernel(dev_ctx, dout, y, &tmp_dx); + } CopyCsrValues(dev_ctx, tmp_dx, x, dx); } @@ -256,8 +312,22 @@ void ElementWiseDivideCsrGradCPUKernel(const Context& dev_ctx, Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, &tmp_dy); phi::NegativeKernel( dev_ctx, dout.values(), tmp_dy.mutable_values()); - auto tmp = sparse::ElementWiseMultiplyCsr(dev_ctx, tmp_dy, out); - 
sparse::ElementWiseDivideCsrKernel(dev_ctx, tmp, y, &tmp_dy); + if (std::is_same>::value || + std::is_same>::value) { + // -dout * (out / y)_conj = -dout * out_conj / y_conj + SparseCsrTensor out_conj; + ConjugateCsrValues(dev_ctx, out, &out_conj); + SparseCsrTensor y_conj; + ConjugateCsrValues(dev_ctx, y, &y_conj); + auto tmp = + sparse::ElementWiseMultiplyCsr(dev_ctx, tmp_dy, out_conj); + sparse::ElementWiseDivideCsrKernel( + dev_ctx, tmp, y_conj, &tmp_dy); + } else { + auto tmp = + sparse::ElementWiseMultiplyCsr(dev_ctx, tmp_dy, out); + sparse::ElementWiseDivideCsrKernel(dev_ctx, tmp, y, &tmp_dy); + } CopyCsrValues(dev_ctx, tmp_dy, y, dy); } } @@ -314,20 +384,40 @@ void ElementWiseMultiplyCooGradCPUKernel(const Context& dev_ctx, SparseCooTensor* dx, SparseCooTensor* dy) { if (dx) { - // dout*y AllocCooPtr(dev_ctx, x, dx); SparseCooTensor tmp_dx; AllocCooPtr(dev_ctx, x, &tmp_dx); - sparse::ElementWiseMultiplyCooKernel(dev_ctx, dout, y, &tmp_dx); + if (std::is_same>::value || + std::is_same>::value) { + // dout*y_conj + SparseCooTensor y_conj; + ConjugateCooValues(dev_ctx, y, &y_conj); + sparse::ElementWiseMultiplyCooKernel( + dev_ctx, dout, y_conj, &tmp_dx); + } else { + // dout*y + sparse::ElementWiseMultiplyCooKernel( + dev_ctx, dout, y, &tmp_dx); + } CopyCooValues(dev_ctx, tmp_dx, x, dx); } if (dy) { - // dout*x AllocCooPtr(dev_ctx, y, dy); SparseCooTensor tmp_dy; AllocCooPtr(dev_ctx, y, &tmp_dy); - sparse::ElementWiseMultiplyCooKernel(dev_ctx, dout, x, &tmp_dy); + if (std::is_same>::value || + std::is_same>::value) { + // dout*x_conj + SparseCooTensor x_conj; + ConjugateCooValues(dev_ctx, x, &x_conj); + sparse::ElementWiseMultiplyCooKernel( + dev_ctx, dout, x_conj, &tmp_dy); + } else { + // dout*x + sparse::ElementWiseMultiplyCooKernel( + dev_ctx, dout, x, &tmp_dy); + } CopyCooValues(dev_ctx, tmp_dy, y, dy); } } @@ -341,11 +431,20 @@ void ElementWiseDivideCooGradCPUKernel(const Context& dev_ctx, SparseCooTensor* dx, SparseCooTensor* dy) { if (dx) { - // dout/y AllocCooPtr(dev_ctx, x, dx); SparseCooTensor tmp_dx; AllocCooPtr(dev_ctx, x, &tmp_dx); - sparse::ElementWiseDivideCooKernel(dev_ctx, dout, y, &tmp_dx); + if (std::is_same>::value || + std::is_same>::value) { + // dout/y_conj + SparseCooTensor y_conj; + ConjugateCooValues(dev_ctx, y, &y_conj); + sparse::ElementWiseDivideCooKernel( + dev_ctx, dout, y_conj, &tmp_dx); + } else { + // dout/y + sparse::ElementWiseDivideCooKernel(dev_ctx, dout, y, &tmp_dx); + } CopyCooValues(dev_ctx, tmp_dx, x, dx); } @@ -357,8 +456,22 @@ void ElementWiseDivideCooGradCPUKernel(const Context& dev_ctx, Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, &tmp_dy); phi::NegativeKernel( dev_ctx, dout.values(), tmp_dy.mutable_values()); - auto tmp = sparse::ElementWiseMultiplyCoo(dev_ctx, tmp_dy, out); - sparse::ElementWiseDivideCooKernel(dev_ctx, tmp, y, &tmp_dy); + if (std::is_same>::value || + std::is_same>::value) { + // -dout * (out / y)_conj = -dout * out_conj / y_conj + SparseCooTensor out_conj; + ConjugateCooValues(dev_ctx, out, &out_conj); + SparseCooTensor y_conj; + ConjugateCooValues(dev_ctx, y, &y_conj); + auto tmp = + sparse::ElementWiseMultiplyCoo(dev_ctx, tmp_dy, out_conj); + sparse::ElementWiseDivideCooKernel( + dev_ctx, tmp, y_conj, &tmp_dy); + } else { + auto tmp = + sparse::ElementWiseMultiplyCoo(dev_ctx, tmp_dy, out); + sparse::ElementWiseDivideCooKernel(dev_ctx, tmp, y, &tmp_dy); + } CopyCooValues(dev_ctx, tmp_dy, y, dy); } } @@ -473,7 +586,9 @@ PD_REGISTER_KERNEL(multiply_csr_csr_grad, double, int16_t, int, - int64_t) { + int64_t, + 
phi::dtype::complex, + phi::dtype::complex) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR); kernel->InputAt(2).SetDataLayout(phi::DataLayout::SPARSE_CSR); @@ -487,7 +602,9 @@ PD_REGISTER_KERNEL(divide_csr_csr_grad, double, int16_t, int, - int64_t) { + int64_t, + phi::dtype::complex, + phi::dtype::complex) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR); kernel->InputAt(2).SetDataLayout(phi::DataLayout::SPARSE_CSR); @@ -534,7 +651,9 @@ PD_REGISTER_KERNEL(multiply_coo_coo_grad, double, int16_t, int, - int64_t) { + int64_t, + phi::dtype::complex, + phi::dtype::complex) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(2).SetDataLayout(phi::DataLayout::SPARSE_COO); @@ -548,7 +667,9 @@ PD_REGISTER_KERNEL(divide_coo_coo_grad, double, int16_t, int, - int64_t) { + int64_t, + phi::dtype::complex, + phi::dtype::complex) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(2).SetDataLayout(phi::DataLayout::SPARSE_COO); diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc index c18349f4f25ab..1ac45ebea449a 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc @@ -401,7 +401,9 @@ PD_REGISTER_KERNEL(multiply_csr_csr, double, int16_t, int, - int64_t) { + int64_t, + complex64, + complex128) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR); } @@ -414,7 +416,9 @@ PD_REGISTER_KERNEL(multiply_coo_coo, double, int16_t, int, - int64_t) { + int64_t, + complex64, + complex128) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } @@ -427,7 +431,9 @@ PD_REGISTER_KERNEL(divide_csr_csr, double, int16_t, int, - int64_t) { + int64_t, + complex64, + complex128) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR); } @@ -440,7 +446,9 @@ PD_REGISTER_KERNEL(divide_coo_coo, double, int16_t, int, - int64_t) { + int64_t, + complex64, + complex128) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/python/paddle/sparse/binary.py b/python/paddle/sparse/binary.py index bcf605a4e3114..1f085074a20a6 100644 --- a/python/paddle/sparse/binary.py +++ b/python/paddle/sparse/binary.py @@ -274,8 +274,8 @@ def add(x: Tensor, y: Tensor, name: str | None = None) -> Tensor: out = x + y Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Returns: @@ -325,8 +325,8 @@ def subtract(x: Tensor, y: Tensor, name: str | None = None) -> Tensor: out = x - y Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -373,8 +373,8 @@ def multiply(x: Tensor, y: Tensor, name: str | None = None) -> Tensor: out = x * y Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -424,8 +424,8 @@ def divide(x: Tensor, y: Tensor, name: str | None = None) -> Tensor: out = x / y Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64, complex64, complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
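        Note: for complex64/complex128 inputs the backward follows the
        conjugate (Wirtinger) convention used by the kernels above: for
        out = x * y, dx = dout * conj(y) and dy = dout * conj(x); for
        out = x / y, dx = dout / conj(y) and dy = -dout * conj(out) / conj(y).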
Returns: diff --git a/test/legacy_test/test_sparse_elementwise_op.py b/test/legacy_test/test_sparse_elementwise_op.py index 917eeded5038c..b72cef180b02a 100644 --- a/test/legacy_test/test_sparse_elementwise_op.py +++ b/test/legacy_test/test_sparse_elementwise_op.py @@ -21,7 +21,6 @@ from paddle.base.framework import in_pir_mode op_list = [__add__, __sub__, __mul__, __truediv__] -op_list_complex = [__add__, __sub__] def get_actual_res(x, y, op): @@ -226,7 +225,7 @@ def test_add_bias(self): class TestSparseElementWiseAPIComplex(unittest.TestCase): def setUp(self): np.random.seed(2022) - self.op_list = op_list_complex + self.op_list = op_list self.csr_shape = [8, 10] self.coo_shape = [3, 7, 2, 9] self.support_dtypes = ['complex64', 'complex128'] @@ -738,19 +737,36 @@ def setUp(self): np.random.seed(2022) self.op_list = op_list self.coo_shape = [4, 8, 3, 5] - self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] + self.support_dtypes = [ + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ] def test_coo(self): if in_pir_mode(): sparse_dim = len(self.coo_shape) - 1 op = __mul__ for dtype in self.support_dtypes: - x = np.random.randint(-255, 255, size=self.coo_shape).astype( - dtype - ) - y = np.random.randint(-255, 255, size=self.coo_shape).astype( - dtype - ) + if 'complex' in dtype: + x = np.vectorize(complex)( + np.random.randint(-255, 255, size=self.coo_shape), + np.random.randint(-255, 255, size=self.coo_shape), + ).astype(dtype) + y = np.vectorize(complex)( + np.random.randint(-255, 255, size=self.coo_shape), + np.random.randint(-255, 255, size=self.coo_shape), + ).astype(dtype) + else: + x = np.random.randint( + -255, 255, size=self.coo_shape + ).astype(dtype) + y = np.random.randint( + -255, 255, size=self.coo_shape + ).astype(dtype) self.dense_x = paddle.to_tensor( x, dtype=dtype, stop_gradient=True @@ -839,19 +855,36 @@ def setUp(self): np.random.seed(2022) self.op_list = op_list self.coo_shape = [4, 8, 3, 5] - self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] + self.support_dtypes = [ + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ] def test_coo(self): if in_pir_mode(): sparse_dim = len(self.coo_shape) - 1 op = __truediv__ for dtype in self.support_dtypes: - x = np.random.randint(-255, 255, size=self.coo_shape).astype( - dtype - ) - y = np.random.randint(-255, 255, size=self.coo_shape).astype( - dtype - ) + if 'complex' in dtype: + x = np.vectorize(complex)( + np.random.randint(-255, 255, size=self.coo_shape), + np.random.randint(-255, 255, size=self.coo_shape), + ).astype(dtype) + y = np.vectorize(complex)( + np.random.randint(-255, 255, size=self.coo_shape), + np.random.randint(-255, 255, size=self.coo_shape), + ).astype(dtype) + else: + x = np.random.randint( + -255, 255, size=self.coo_shape + ).astype(dtype) + y = np.random.randint( + -255, 255, size=self.coo_shape + ).astype(dtype) self.dense_x = paddle.to_tensor( x, dtype=dtype, stop_gradient=True From f27b6b0b5abaefd81b3d5c2dbc9d9e77c09de3e7 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sun, 29 Sep 2024 21:39:43 +0800 Subject: [PATCH 015/135] fix expand as kernel bug (#68516) --- paddle/phi/kernels/gpu/expand_as_kernel.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/expand_as_kernel.cu b/paddle/phi/kernels/gpu/expand_as_kernel.cu index 6bd7cb80da28f..9712c8791ff37 100644 --- a/paddle/phi/kernels/gpu/expand_as_kernel.cu +++ 
b/paddle/phi/kernels/gpu/expand_as_kernel.cu @@ -27,8 +27,14 @@ template void ExpandAsKernel(const Context& ctx, const DenseTensor& x, const paddle::optional& y, - const std::vector& target_shape, + const std::vector& target_shape_t, DenseTensor* out) { + std::vector target_shape = target_shape_t; + + if (y.get_ptr()) { + target_shape = phi::vectorize(y.get_ptr()->dims()); + } + int rank = x.dims().size(); int target_rank = static_cast(target_shape.size()); auto vec_in_dims = common::vectorize(x.dims()); From 951eeee53b94c1a0fe6dd5b6e4f7722911f57671 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sun, 29 Sep 2024 22:54:32 +0800 Subject: [PATCH 016/135] fix reduce grad shape cal bug (#68524) --- paddle/fluid/primitive/utils/utils.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/primitive/utils/utils.h b/paddle/fluid/primitive/utils/utils.h index 2e4113ae4c954..bb205a9b777b6 100644 --- a/paddle/fluid/primitive/utils/utils.h +++ b/paddle/fluid/primitive/utils/utils.h @@ -80,12 +80,14 @@ static std::vector get_expand_dims(const Tensor& origin, // This function compute unsqueeze dims for reshape to replace unsqueeze. static std::vector get_unsqueeze_dims( const Tensor& origin, const std::vector& axis) { + auto sort_axis = axis; + std::sort(sort_axis.begin(), sort_axis.end()); auto origin_dims = origin.shape(); - auto total_shape_size = origin_dims.size() + axis.size(); + auto total_shape_size = origin_dims.size() + sort_axis.size(); std::vector result; size_t j = 0, k = 0; for (size_t i = 0; i < total_shape_size; ++i) { - if (j < axis.size() && axis[j] == int64_t(i)) { + if (j < sort_axis.size() && sort_axis[j] == int64_t(i)) { result.push_back(1); j++; } else { From b08b79b1348003e16849c4a6a60e16aa20939f46 Mon Sep 17 00:00:00 2001 From: cubehan3 Date: Mon, 30 Sep 2024 10:39:33 +0800 Subject: [PATCH 017/135] fixed bug introduced by function (#68527) --- paddle/fluid/primitive/rule/vjp/details.h | 29 ++++++++++++----------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index e54371fbac5f3..87a241ab69ecb 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -656,36 +656,37 @@ void add_grad(const Tensor& x, Tensor* dx, Tensor* dy) { if (dy) { - if (!common::AreDimsWithDynamicShapeCompatible(out_grad.dims(), y.dims())) { - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { - auto dy_tmp = reduce_as(out_grad, y); - set_output(dy_tmp, dy); - } else { + if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + auto dy_tmp = reduce_as(out_grad, y); + set_output(dy_tmp, dy); + } else { + if (out_grad.dims() != y.dims()) { phi::DDim reduce_dim = get_reduce_dims_from_out(out_grad.dims(), y.dims()); auto dy_reduce_res = out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); + } else { + by_pass(out_grad, dy); } - } else { - by_pass(out_grad, dy); } } + if (dx) { - if (!common::AreDimsWithDynamicShapeCompatible(out_grad.dims(), x.dims())) { - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { - auto dx_tmp = reduce_as(out_grad, x); - set_output(dx_tmp, dx); - } else { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + auto dx_tmp = reduce_as(out_grad, x); + set_output(dx_tmp, dx); + } else 
{ + if (out_grad.dims() != x.dims()) { auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); auto dx_reduce_res = out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); + } else { + by_pass(out_grad, dx); } - } else { - by_pass(out_grad, dx); } } } From a9b45a3d1805b5a255fb851908b918b315f67992 Mon Sep 17 00:00:00 2001 From: enzodechine Date: Mon, 30 Sep 2024 13:53:22 +0800 Subject: [PATCH 018/135] [XPU] support int64_t tril & triu (#68423) --- paddle/phi/backends/xpu/xpu3_op_list.cc | 18 +++++++++++++++--- .../phi/kernels/xpu/tril_triu_grad_kernel.cc | 15 ++++++++++++--- paddle/phi/kernels/xpu/tril_triu_kernel.cc | 3 +++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index 04bb90a4d8a18..c799be825fc9e 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -1233,33 +1233,45 @@ XPUOpMap& get_kl3_ops() { {"tril_triu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, + phi::DataType::INT64, phi::DataType::FLOAT16, phi::DataType::BFLOAT16, phi::DataType::BOOL})}, {"tril", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, + phi::DataType::INT64, phi::DataType::FLOAT16, phi::DataType::BFLOAT16, phi::DataType::BOOL})}, {"triu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, + phi::DataType::INT64, phi::DataType::FLOAT16, phi::DataType::BFLOAT16, phi::DataType::BOOL})}, {"tril_triu_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, - phi::DataType::FLOAT16})}, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::BOOL})}, {"tril_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, - phi::DataType::FLOAT16})}, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::BOOL})}, {"triu_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, - phi::DataType::FLOAT16})}, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16, + phi::DataType::BOOL})}, {"tile", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64, diff --git a/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc index 4f672c6b60929..f90400b48baca 100644 --- a/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc @@ -69,19 +69,28 @@ PD_REGISTER_KERNEL(tril_grad, ALL_LAYOUT, phi::TrilGradKernel, int, + int64_t, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16, + bool) {} PD_REGISTER_KERNEL(triu_grad, XPU, ALL_LAYOUT, phi::TriuGradKernel, int, + int64_t, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16, + bool) {} PD_REGISTER_KERNEL(tril_triu_grad, XPU, ALL_LAYOUT, phi::TrilTriuGradKernel, int, + int64_t, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16, + bool) {} diff --git a/paddle/phi/kernels/xpu/tril_triu_kernel.cc b/paddle/phi/kernels/xpu/tril_triu_kernel.cc index 26169136c9d3c..ba93b8fa8f9e0 100644 --- a/paddle/phi/kernels/xpu/tril_triu_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_triu_kernel.cc @@ -69,6 +69,7 @@ PD_REGISTER_KERNEL(tril_triu, ALL_LAYOUT, phi::TrilTriuKernel, int, + int64_t, float, phi::dtype::float16, phi::dtype::bfloat16, @@ -78,6 +79,7 @@ PD_REGISTER_KERNEL(tril, ALL_LAYOUT, phi::TrilKernel, int, + int64_t, 
float, phi::dtype::float16, phi::dtype::bfloat16, @@ -87,6 +89,7 @@ PD_REGISTER_KERNEL(triu, ALL_LAYOUT, phi::TriuKernel, int, + int64_t, float, phi::dtype::float16, phi::dtype::bfloat16, From 3f0b373dce7b54709914b716fdfdda711356d887 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 30 Sep 2024 14:06:51 +0800 Subject: [PATCH 019/135] Flags sync for benchmark (#68510) * add FLAGS_sync_for_benchmark --- paddle/phi/api/generator/api_base.py | 4 ++++ paddle/phi/api/generator/api_gen.py | 1 + paddle/phi/api/generator/backward_api_gen.py | 1 + paddle/phi/api/generator/dist_api_gen.py | 5 +++++ paddle/phi/api/generator/dist_bw_api_gen.py | 1 + paddle/phi/api/generator/intermediate_api_gen.py | 1 + paddle/phi/api/generator/sparse_api_gen.py | 5 +++++ paddle/phi/api/generator/sparse_bw_api_gen.py | 1 + paddle/phi/api/generator/strings_api_gen.py | 5 +++++ 9 files changed, 24 insertions(+) diff --git a/paddle/phi/api/generator/api_base.py b/paddle/phi/api/generator/api_base.py index 983d7b7eed27e..45ed28b724774 100644 --- a/paddle/phi/api/generator/api_base.py +++ b/paddle/phi/api/generator/api_base.py @@ -1352,6 +1352,10 @@ def gen_kernel_code(self, kernel_name, code_indent, inplace_flag=False): {code_indent} kernel_record_event = new phi::RecordEvent(\"{kernel_name} kernel launch\", phi::TracerEventType::DygraphKernelLaunch, 1); {code_indent} }} {code_indent} (*kernel_fn)({kernel_args}, {", ".join(outputs_args)}); +{code_indent} if (FLAGS_benchmark) {{ +{code_indent} dev_ctx->Wait(); +{code_indent} std::cout << \"{kernel_name} kernel run finish.\" << std::endl; +{code_indent} }} {code_indent} if(kernel_record_event != nullptr){{ {code_indent} delete kernel_record_event; {code_indent} }} diff --git a/paddle/phi/api/generator/api_gen.py b/paddle/phi/api/generator/api_gen.py index 1499fee086f3d..9a45f10ca4c76 100644 --- a/paddle/phi/api/generator/api_gen.py +++ b/paddle/phi/api/generator/api_gen.py @@ -424,6 +424,7 @@ def source_include(header_file_path): PD_DECLARE_bool(conv2d_disable_cudnn); COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ diff --git a/paddle/phi/api/generator/backward_api_gen.py b/paddle/phi/api/generator/backward_api_gen.py index 4185f8acb32a6..38d9e52cdad55 100644 --- a/paddle/phi/api/generator/backward_api_gen.py +++ b/paddle/phi/api/generator/backward_api_gen.py @@ -291,6 +291,7 @@ def source_include(header_file_path, fw_header_file_path): PD_DECLARE_bool(conv2d_disable_cudnn); COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ diff --git a/paddle/phi/api/generator/dist_api_gen.py b/paddle/phi/api/generator/dist_api_gen.py index 71863a14cef85..276eb1b49fa0f 100644 --- a/paddle/phi/api/generator/dist_api_gen.py +++ b/paddle/phi/api/generator/dist_api_gen.py @@ -423,6 +423,10 @@ using kernel_signature = {}; auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)({}, {}); + if (FLAGS_benchmark) {{ + dev_ctx->Wait(); + std::cout << \"{} kernel run finish.\" << std::endl; + }} if(kernel_record_event != nullptr){{ delete kernel_record_event; }} @@ -1735,6 +1739,7 @@ def generate_kernel_call_code(self) -> str: kernel_signature, ", ".join(input_args), ", ".join(self.dense_output_args), + self.api, ) global ops_infer_shape_in_runtime if self.kernel['func'][0] in ops_infer_shape_in_runtime: diff --git a/paddle/phi/api/generator/dist_bw_api_gen.py b/paddle/phi/api/generator/dist_bw_api_gen.py index e2511262ed583..383839305e6b5 100644 --- a/paddle/phi/api/generator/dist_bw_api_gen.py +++ 
b/paddle/phi/api/generator/dist_bw_api_gen.py @@ -524,6 +524,7 @@ def source_include(header_file_path, fw_header_file_path): PD_DECLARE_bool(conv2d_disable_cudnn); COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ diff --git a/paddle/phi/api/generator/intermediate_api_gen.py b/paddle/phi/api/generator/intermediate_api_gen.py index 4bf2e7ac8690b..4ee135ff34c00 100644 --- a/paddle/phi/api/generator/intermediate_api_gen.py +++ b/paddle/phi/api/generator/intermediate_api_gen.py @@ -63,6 +63,7 @@ def source_include(header_file_path): #endif COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ diff --git a/paddle/phi/api/generator/sparse_api_gen.py b/paddle/phi/api/generator/sparse_api_gen.py index b4c19b45b6a27..3da4059f313fb 100644 --- a/paddle/phi/api/generator/sparse_api_gen.py +++ b/paddle/phi/api/generator/sparse_api_gen.py @@ -344,6 +344,10 @@ def gen_sparse_kernel_code(self, kernel_name, inplace_flag=False): {self.gene_infer_meta(kernel_output_names, '')} {kernel_context_code} phi_kernel(&kernel_context); + if (FLAGS_benchmark) {{ + dev_ctx->Wait(); + std::cout << \"{self.api} kernel run finish.\" << std::endl; + }} {return_code}""" def get_condition_code(self, kernel_name): @@ -441,6 +445,7 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/sparse/multiary.h" COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ diff --git a/paddle/phi/api/generator/sparse_bw_api_gen.py b/paddle/phi/api/generator/sparse_bw_api_gen.py index c43c22881a43c..3f9e3c8c895cd 100644 --- a/paddle/phi/api/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/generator/sparse_bw_api_gen.py @@ -136,6 +136,7 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/sparse/backward.h" COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ diff --git a/paddle/phi/api/generator/strings_api_gen.py b/paddle/phi/api/generator/strings_api_gen.py index a677aba1dee21..c22b5a6e87b03 100644 --- a/paddle/phi/api/generator/strings_api_gen.py +++ b/paddle/phi/api/generator/strings_api_gen.py @@ -227,6 +227,10 @@ def gen_string_tensor_kernel_code(self, inplace_flag=False, code_indent=""): {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} (*kernel_fn)({kernel_args}, {", ".join(outputs_args)}); +{code_indent} if (FLAGS_benchmark) {{ +{code_indent} dev_ctx->Wait(); +{code_indent} std::cout << \"{self.api} kernel run finish.\" << std::endl; +{code_indent} }} {code_indent} {self.gene_return_code()}""" @@ -339,6 +343,7 @@ def source_include(header_file_path): #include "paddle/phi/core/kernel_registry.h" COMMON_DECLARE_int32(low_precision_op_list); +COMMON_DECLARE_bool(benchmark); """ From 546fe1d2e54167b2d39c24073ee8de672f4c9f48 Mon Sep 17 00:00:00 2001 From: Shuhao Liang <50269654+lshpku@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:14:23 +0800 Subject: [PATCH 020/135] [CINN] Remove hard-coded var loading in CodeGenLLVM (#68523) --- paddle/cinn/backends/codegen_invoke_module.cc | 32 -------- paddle/cinn/backends/codegen_invoke_module.h | 13 +--- paddle/cinn/backends/llvm/codegen_llvm.cc | 76 ++----------------- paddle/cinn/backends/llvm/codegen_llvm.h | 8 -- paddle/cinn/runtime/cuda/cuda_intrinsics.cc | 8 ++ paddle/cinn/runtime/cuda/cuda_util.cc | 5 ++ paddle/cinn/runtime/cuda/cuda_util.h | 2 + paddle/cinn/runtime/intrinsic.h | 5 ++ 8 files changed, 27 insertions(+), 122 deletions(-) diff --git 
a/paddle/cinn/backends/codegen_invoke_module.cc b/paddle/cinn/backends/codegen_invoke_module.cc index 8e9d2c3090c16..eacb13e62df31 100644 --- a/paddle/cinn/backends/codegen_invoke_module.cc +++ b/paddle/cinn/backends/codegen_invoke_module.cc @@ -62,38 +62,6 @@ llvm::Value* CodeGenInvokeModule::LowerInvokeFunc( return f_; } -llvm::Value* CodeGenInvokeModule::LowerParseArgsValueCall( - const ir::Call* call_ir) { - auto ret_type = CinnTypeToLLVMType(Int(64), m_); - std::vector args_type; - PADDLE_ENFORCE_EQ( - call_ir->read_args.size(), - 2, - ::common::errors::InvalidArgument( - "The number of arguments of ParseArgsValue should be 2")); - PADDLE_ENFORCE_EQ(call_ir->read_args[0].is_var() && - call_ir->read_args[0].as_var()->type().is_cpp_handle(), - true, - ::common::errors::InvalidArgument( - "The first read argument must be a variable " - "with a C++ handle type.")); - - PADDLE_ENFORCE_EQ(call_ir->read_args[1].type().is_int(32), - true, - ::common::errors::InvalidArgument( - "The second read argument must be of type int32.")); - args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); - args_type.push_back(CinnTypeToLLVMType(type_of(), m_)); - - auto func_type = llvm::FunctionType::get(ret_type, args_type, false); - auto call_func = m_->getOrInsertFunction(call_ir->name, func_type); - - std::vector call_args; - call_args.push_back(std::addressof(*f_->arg_begin())); - call_args.push_back(b_->getInt32(call_ir->read_args[1].as_int32())); - return b_->CreateCall(call_func, call_args); -} - llvm::Value* CodeGenSwitchHost::LowerInnerCaseCall(const ir::Call* op) { std::vector ll_function_args; std::transform(f_->arg_begin(), diff --git a/paddle/cinn/backends/codegen_invoke_module.h b/paddle/cinn/backends/codegen_invoke_module.h index 7fbe9ee019f2c..c00bf1ecbad5a 100644 --- a/paddle/cinn/backends/codegen_invoke_module.h +++ b/paddle/cinn/backends/codegen_invoke_module.h @@ -43,19 +43,8 @@ class CodeGenInvokeModule : public CodeGenLLVM { return LowerInvokeFunc(func); } - llvm::Value *Visit(const ir::Call *op) override { - // TODO(Hongqing-work): change intrinsic name to get_value_in_kernel_args - if (op->name == runtime::intrinsic::get_value_in_cuda_kernel_args) { - return LowerParseArgsValueCall(op); - } else { - return CodeGenLLVM::Visit(op); - } - } - protected: llvm::Value *LowerInvokeFunc(const ir::_LoweredFunc_ *func); - - llvm::Value *LowerParseArgsValueCall(const ir::Call *call_ir); }; class CodeGenHost : public CodeGenInvokeModule { @@ -80,7 +69,7 @@ class CodeGenSwitchHost : public CodeGenInvokeModule { // only support call of args get function and inner case host function call llvm::Value *Visit(const ir::Call *op) override { if (op->name == runtime::intrinsic::get_value_in_cuda_kernel_args) { - return CodeGenInvokeModule::LowerParseArgsValueCall(op); + return CodeGenLLVM::Visit(op); } else { return LowerInnerCaseCall(op); } diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index 6d78b700c1343..0b063e403f2a8 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -511,53 +511,6 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Cast *op) { llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) { SymbolTableGuard symbol_table_guard(*symbol_table_); - do { - break; - llvm::BasicBlock *preheader_bb = b_->GetInsertBlock(); - auto *for_begin = llvm::BasicBlock::Create( - b_->getContext(), "for_begin", b_->GetInsertBlock()->getParent()); - auto *for_body = llvm::BasicBlock::Create( - 
b_->getContext(), "for_body", b_->GetInsertBlock()->getParent()); - auto *for_end = llvm::BasicBlock::Create( - b_->getContext(), "for_end", b_->GetInsertBlock()->getParent()); - - Br(for_begin); - b_->SetInsertPoint(for_begin); - - auto *begin = Visit(&op->min); - auto *loop_value = PHI(begin->getType(), 2); - loop_value->addIncoming(begin, preheader_bb); - - llvm::Value *old_var = GetVar(op->loop_var->name); - SetVar(op->loop_var->name, loop_value); - auto *end = Visit(&op->extent); - CondBr(ICmpSLT(loop_value, end), for_body, for_end); - b_->SetInsertPoint(for_body); - Visit(&op->body); - - if (old_var) { - SetVar(op->loop_var->name, old_var); - } else { - symbol_table_->Erase(op->loop_var->name); - } - - auto loop_next = Add(loop_value, - llvm::ConstantInt::get(b_->getInt32Ty(), stride), - "indvar.inc", - true, - true); - loop_value->addIncoming(loop_next, b_->GetInsertBlock()); - - Br(for_begin); - b_->SetInsertPoint(for_end); - - return nullptr; - // llvm::AllocaInst *loop_var = Alloca(b_->getInt32Ty(), nullptr, - // op->loop_var->name); loop_var->setAlignment(llvm::Align(4)); - // SetVar(op->loop_var->name, loop_var); - } while (false); - - //////////////////////////////////// llvm::BasicBlock *preheader_bb = b_->GetInsertBlock(); llvm::BasicBlock *exit_bb = nullptr; @@ -814,20 +767,13 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_Module_ *op) { } llvm::Value *CodeGenLLVM::Visit(const ir::_Var_ *op) { - llvm::Value *value = GetVar(op->name, false); - llvm::Value *result{}; - CHECK(value) << "ir::_Var_[" << op->name << "]: value is null"; - // TODO(fc500110) hard coding - if (LLVM_WillVarLowerAsPointer(op->name)) { - result = value; - } else if (value->getType()->isPointerTy() && - !value->getType()->getPointerElementType()->isPointerTy()) { - result = Load(value, op->name + "_load"); - } else { - result = value; + llvm::Value *value = GetVar(op->name, /* lazy= */ false); + // When visiting a Var that is allocated on the stack, we are actually + // reading its value instead of its address. + if (llvm::AllocaInst::classof(value)) { + return Load(value, op->name + "_load"); } - - return result; + return value; } void CodeGenLLVM::Scalarize( @@ -1043,12 +989,6 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_Buffer_ *op) { llvm::Value *CodeGenLLVM::Visit(const ir::_Tensor_ *op) { return GetVar(op->name); - auto *buffer_op = op->buffer.As(); - if (symbol_table_->Lookup(buffer_op->name)) { - return Visit(buffer_op); - } - - return SetVar(buffer_op->name, Visit(buffer_op)); } template { #undef __m }; -/** - * Tell whether a variable called \p \var_name will lowered to a pointer type in - * LLVM. - * @param var_name name of the variable. - * @return a boolean. 
- */ -bool LLVM_WillVarLowerAsPointer(const std::string &var_name); - class SymbolTable { public: SymbolTable() = default; diff --git a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc index 941bb9f0172e2..f712307a2bda5 100644 --- a/paddle/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/paddle/cinn/runtime/cuda/cuda_intrinsics.cc @@ -433,6 +433,14 @@ CINN_REGISTER_HELPER(cinn_cuda_host_api) { .AddInputType() // index .End(); + using cinn::runtime::cuda::cinn_get_item_in_cuda_kernel_args; + REGISTER_EXTERN_FUNC_HELPER(cinn_get_item_in_cuda_kernel_args, + cinn::common::DefaultHostTarget()) + .SetRetType() + .AddInputType() // args + .AddInputType() // index + .End(); + using cinn::runtime::cuda::infer_shape_set_value; REGISTER_EXTERN_FUNC_HELPER(infer_shape_set_value, cinn::common::DefaultHostTarget()) diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index 4d9a34d2ddbe6..a0c12732a4ad5 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -83,6 +83,11 @@ int64_t cinn_get_value_in_cuda_kernel_args(void *v_args, int idx) { return args[idx].operator int64_t(); } +void *cinn_get_item_in_cuda_kernel_args(void *v_args, int idx) { + cinn_pod_value_t *args = static_cast(v_args); + return static_cast(&args[idx]); +} + void cinn_call_cuda_kernel(void *kernel_fn, void *v_args, int num_args, diff --git a/paddle/cinn/runtime/cuda/cuda_util.h b/paddle/cinn/runtime/cuda/cuda_util.h index 21944c5d8a9c4..592ef50343bf0 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.h +++ b/paddle/cinn/runtime/cuda/cuda_util.h @@ -90,6 +90,8 @@ void cinn_call_cuda_memcpy(void* v_args, void* stream = nullptr); int64_t cinn_get_value_in_cuda_kernel_args(void* v_args, int idx); +void* cinn_get_item_in_cuda_kernel_args(void* v_args, int idx); + void infer_shape_set_value(int row, int col, int64_t value, int64_t** v); /** diff --git a/paddle/cinn/runtime/intrinsic.h b/paddle/cinn/runtime/intrinsic.h index 1c02ac6de5696..6d81caf3102dd 100644 --- a/paddle/cinn/runtime/intrinsic.h +++ b/paddle/cinn/runtime/intrinsic.h @@ -106,9 +106,14 @@ static const char* call_cuda_kernel = "cinn_call_cuda_kernel"; static const char* call_hip_kernel = "cinn_call_hip_kernel"; +static const char* call_cuda_memset = "cinn_call_cuda_memset"; + static const char* get_value_in_cuda_kernel_args = "cinn_get_value_in_cuda_kernel_args"; +static const char* get_item_in_cuda_kernel_args = + "cinn_get_item_in_cuda_kernel_args"; + static const char* infer_shape_set_value = "infer_shape_set_value"; static const char* pod_values_to_array_repr = "pod_values_to_array"; From c4942d6d444ca5db90b78d66c0d896ef789806a3 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:41:27 +0800 Subject: [PATCH 021/135] add arange forward only trait (#68535) --- paddle/phi/ops/yaml/legacy/static_ops.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index 00a441fffae68..2c32090a7ec4f 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -69,6 +69,7 @@ func : arange_tensor data_transform : skip_transform : start, end, step + traits : paddle::dialect::ForwardOnlyTrait - op : assign args : (Tensor x) From 6e23b1a89f62e1047fa8e6d3ff1ef6665ea7f5d5 Mon Sep 17 00:00:00 2001 From: cubehan3 Date: Mon, 30 Sep 2024 15:53:09 +0800 Subject: [PATCH 022/135] polish code style for 
half-precision cast (#68536) --- paddle/fluid/primitive/composite/composite.h | 268 ++++--------------- paddle/fluid/primitive/rule/vjp/details.h | 157 +++-------- paddle/fluid/primitive/utils/utils.h | 20 ++ 3 files changed, 109 insertions(+), 336 deletions(-) diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 9887ea1836092..f9b143a696114 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -36,13 +36,8 @@ Tensor any_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { - auto org_dtype = x.dtype(); - auto x_tmp = x; + auto x_tmp = ConverToMT(x); - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_tmp = cast(x, DataType::FLOAT32); - } std::vector x_dim = x_tmp.shape(); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); @@ -87,11 +82,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { Tensor res = sum_x / value; - if (need_cast) { - return cast(res, org_dtype); - } else { - return res; - } + return ConverToOrig(res, x.dtype()); } static void check_valid_type(const DataType& dtype) { @@ -121,13 +112,7 @@ Tensor p_norm_decomp(const Tensor& x, const float epsilon = 1.0e-12f, const bool& keepdim = false, const bool& asvector = false) { - auto org_dtype = x.dtype(); - auto x_tmp = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_tmp = cast(x, DataType::FLOAT32); - } + auto x_tmp = ConverToMT(x); Tensor res; if (porder == 0.0) { @@ -160,31 +145,17 @@ Tensor p_norm_decomp(const Tensor& x, res = elementwise_pow(res, inv_porder_tensor); } - if (need_cast) { - return cast(res, org_dtype); - } else { - return res; - } + return ConverToOrig(res, x.dtype()); } template Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) { - auto org_dtype = x.dtype(); - auto x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } + auto x_cast = ConverToMT(x); check_valid_type(y.dtype()); Tensor y_full = full_scalar(y, x_cast.dtype()); auto ans = elementwise_pow(x_cast, y_full); - if (need_cast) { - return cast(ans, org_dtype); - } else { - return ans; - } + return ConverToOrig(ans, x.dtype()); } template @@ -302,12 +273,8 @@ std::tuple batch_norm_decomp( bool use_global_stats, bool trainable_statistics) { auto org_dtype = x.dtype(); - Tensor x_cast = x; + Tensor x_cast = ConverToMT(x); - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } int rank = x_cast.shape().size(); DataLayout data_layout_ = common::StringToDataLayout(data_layout); int feature_axis; @@ -393,9 +360,7 @@ std::tuple batch_norm_decomp( } batch_mean_ = assign(batch_mean); inv_std_ = assign(inv_std); - if (need_cast) { - y = cast(y, org_dtype); - } + y = ConverToOrig(y, org_dtype); } else { std::vector x_dim = x_cast.shape(); std::vector stats_shape; @@ -450,9 +415,7 @@ std::tuple batch_norm_decomp( batch_mean_ = assign(batch_mean); inv_std_ = assign(inv_std); - if (need_cast) { - y = cast(y, org_dtype); - } + y = ConverToOrig(y, org_dtype); } if (!use_run_stat) { return std::make_tuple( @@ -467,45 +430,25 @@ std::tuple batch_norm_decomp( template Tensor softmax_decomp(const Tensor& x, const int& axis) { - auto org_dtype = x.dtype(); - auto x_tmp = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_tmp = cast(x, 
DataType::FLOAT32); - } + auto x_tmp = ConverToMT(x); auto max_tmp = max(x_tmp, {axis}, true); auto molecular = exp(x_tmp - max_tmp); auto res = molecular / sum(molecular, {axis}, molecular.dtype(), true); - if (need_cast) { - return cast(res, org_dtype); - } else { - return res; - } + return ConverToOrig(res, x.dtype()); } template Tensor log_softmax_decomp(const Tensor& x, const int& axis) { - auto org_dtype = x.dtype(); - auto x_tmp = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_tmp = cast(x, DataType::FLOAT32); - } + auto x_tmp = ConverToMT(x); auto max_tmp = max(x_tmp, {axis}, true); auto sub = x_tmp - max_tmp; auto molecular = exp(sub); auto res = sub - log(sum(molecular, {axis}, molecular.dtype(), true)); - if (need_cast) { - return cast(res, org_dtype); - } else { - return res; - } + return ConverToOrig(res, x.dtype()); } template @@ -560,19 +503,9 @@ Tensor stack_decomp(const std::vector& x, const int& axis) { template Tensor silu_decomp(const Tensor& x) { - auto org_dtype = x.dtype(); - auto x_tmp = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_tmp = cast(x, DataType::FLOAT32); - } + auto x_tmp = ConverToMT(x); auto res = x_tmp * sigmoid(x_tmp); - if (need_cast) { - return cast(res, org_dtype); - } else { - return res; - } + return ConverToOrig(res, x.dtype()); } template @@ -700,14 +633,7 @@ std::tuple layer_norm_decomp( if (has_dynamic_shape(x.shape())) { std::vector axis; auto org_dtype = x.dtype(); - Tensor x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - - // cast dtype to float32 if dtype =float16 or bfloat16 - if (need_cast) { - x_cast = cast(x_cast, DataType::FLOAT32); - } + Tensor x_cast = ConverToMT(x); auto x_dim = x.shape(); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { @@ -727,17 +653,13 @@ std::tuple layer_norm_decomp( Tensor scale_cast; if (scale) { scale_cast = backend::reshape_with_tensor(scale.get(), slice_shape_r); - if (need_cast) { - scale_cast = cast(scale_cast, DataType::FLOAT32); - } + scale_cast = ConverToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; if (bias) { bias_cast = backend::reshape_with_tensor(bias.get(), slice_shape_r); - if (need_cast) { - bias_cast = cast(bias_cast, DataType::FLOAT32); - } + bias_cast = ConverToMT(bias_cast); out = out + bias_cast; } mean_ = backend::reshape_with_tensor(mean_, slice_shape_l); @@ -746,23 +668,13 @@ std::tuple layer_norm_decomp( // same as LayerNormInferMeta // x: float32 --> out: float32, mean: float32, variance: float32 // x: float16 --> out: float16, mean: float32, variance: float32 - if (need_cast) { - out = cast(out, org_dtype); - } - + out = ConverToOrig(out, org_dtype); return std::make_tuple(out, mean_, variance); } std::vector axis; auto org_dtype = x.dtype(); - Tensor x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - - // cast dtype to float32 if dtype =float16 or bfloat16 - if (need_cast) { - x_cast = cast(x_cast, DataType::FLOAT32); - } + Tensor x_cast = ConverToMT(x); auto x_dim = x.shape(); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { @@ -788,17 +700,13 @@ std::tuple layer_norm_decomp( Tensor scale_cast; if (scale) { scale_cast = reshape(scale.get(), slice_shape_r); - if (need_cast) { - scale_cast = cast(scale_cast, DataType::FLOAT32); - } + scale_cast = ConverToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; if (bias) { bias_cast = reshape(bias.get(), slice_shape_r); - if (need_cast) { - bias_cast = cast(bias_cast, DataType::FLOAT32); - } + bias_cast = 
ConverToMT(bias_cast); out = out + bias_cast; } mean_ = reshape(mean_, slice_shape_l); @@ -807,10 +715,7 @@ std::tuple layer_norm_decomp( // same as LayerNormInferMeta // x: float32 --> out: float32, mean: float32, variance: float32 // x: float16 --> out: float16, mean: float32, variance: float32 - if (need_cast) { - out = cast(out, org_dtype); - } - + out = ConverToOrig(out, org_dtype); return std::make_tuple(out, mean_, variance); } @@ -990,12 +895,7 @@ std::tuple instance_norm_decomp( float epsilon) { if (has_dynamic_shape(x.shape())) { auto org_dtype = x.dtype(); - Tensor x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } + Tensor x_cast = ConverToMT(x); std::vector axis; auto x_dim = x.shape(); @@ -1032,18 +932,14 @@ std::tuple instance_norm_decomp( if (scale) { scale_cast = backend::reshape_with_tensor(scale.get(), slice_shape_tensor); - if (need_cast) { - scale_cast = cast(scale_cast, DataType::FLOAT32); - } + scale_cast = ConverToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; if (bias) { bias_cast = backend::reshape_with_tensor(bias.get(), slice_shape_tensor); - if (need_cast) { - bias_cast = cast(bias_cast, DataType::FLOAT32); - } + bias_cast = ConverToMT(bias_cast); out = out + bias_cast; } @@ -1052,21 +948,12 @@ std::tuple instance_norm_decomp( auto variance_out = reshape(rsqrt_var, res_shape); Tensor res; - if (need_cast) { - res = cast(out, org_dtype); - } else { - res = out; - } + res = ConverToOrig(out, org_dtype); return std::make_tuple(res, mean_out, variance_out); } auto org_dtype = x.dtype(); - Tensor x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } + Tensor x_cast = ConverToMT(x); std::vector axis; auto x_dim = x.shape(); @@ -1090,17 +977,13 @@ std::tuple instance_norm_decomp( Tensor scale_cast; if (scale) { scale_cast = reshape(scale.get(), slice_shape); - if (need_cast) { - scale_cast = cast(scale_cast, DataType::FLOAT32); - } + scale_cast = ConverToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; if (bias) { bias_cast = reshape(bias.get(), slice_shape); - if (need_cast) { - bias_cast = cast(bias_cast, DataType::FLOAT32); - } + bias_cast = ConverToMT(bias_cast); out = out + bias_cast; } @@ -1109,12 +992,7 @@ std::tuple instance_norm_decomp( auto variance_out = reshape(rsqrt_var, res_shape); Tensor res; - if (need_cast) { - res = cast(out, org_dtype); - } else { - res = out; - } - + res = ConverToOrig(out, org_dtype); return std::make_tuple(res, mean_out, variance_out); } @@ -1253,12 +1131,7 @@ std::tuple group_norm_decomp( } auto org_dtype = x.dtype(); - Tensor x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } + Tensor x_cast = ConverToMT(x); Tensor x_dim_t; Tensor out, mean_, var_; @@ -1316,9 +1189,7 @@ std::tuple group_norm_decomp( } else { scale_cast = scale.get(); } - if (need_cast) { - scale_cast = cast(scale_cast, DataType::FLOAT32); - } + scale_cast = ConverToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; @@ -1328,9 +1199,7 @@ std::tuple group_norm_decomp( } else { bias_cast = bias.get(); } - if (need_cast) { - bias_cast = cast(bias_cast, DataType::FLOAT32); - } + bias_cast = ConverToMT(bias_cast); out = out + bias_cast; } Tensor mean_out, var_out; @@ -1345,32 +1214,20 @@ std::tuple group_norm_decomp( mean_out = reshape(mean_, res_shape); var_out = reshape(var_, res_shape); } - if (need_cast) { - out = cast(out, 
org_dtype); - } + out = ConverToOrig(out, org_dtype); return std::make_tuple(out, mean_out, var_out); } template Tensor square_decomp(const Tensor& x) { - auto org_dtype = x.dtype(); - auto x_cast = x; - - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } + auto x_cast = ConverToMT(x); Tensor two; two = full_scalar(2, x_cast.dtype()); auto ans = elementwise_pow(x_cast, two); - if (need_cast) { - return cast(ans, org_dtype); - } else { - return ans; - } + return ConverToOrig(ans, x.dtype()); } template @@ -1416,13 +1273,8 @@ Tensor sigmoid_cross_entropy_with_logits_decomp( template Tensor mean_all_decomp(const Tensor& x) { - auto org_dtype = x.dtype(); - auto x_cast = x; + auto x_cast = ConverToMT(x); auto x_shape = x.shape(); - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } Tensor ans; if (has_dynamic_shape(x_shape)) { @@ -1437,11 +1289,7 @@ Tensor mean_all_decomp(const Tensor& x) { ans = sum(x_cast) / x_cast.numel(); } - if (need_cast) { - return cast(ans, org_dtype); - } else { - return ans; - } + return ConverToOrig(ans, x.dtype()); } template @@ -1536,13 +1384,8 @@ Tensor index_sample_decomp(const Tensor& x, const Tensor& index) { template Tensor elu_decomp(const Tensor& x, const float alpha) { - auto org_dtype = x.dtype(); - auto x_cast = x; + auto x_cast = ConverToMT(x); - bool need_cast = is_half_dtype(org_dtype); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - } Tensor zero; Tensor tmp_res; @@ -1555,37 +1398,16 @@ Tensor elu_decomp(const Tensor& x, const float alpha) { tmp_res = alpha * (exp(x_cast) - 1); } auto ans = where(x_cast > zero, x_cast, tmp_res); - if (need_cast) { - return cast(ans, org_dtype); - } else { - return ans; - } + return ConverToOrig(ans, x.dtype()); } template Tensor lerp_decomp(const Tensor& x, const Tensor& y, const Tensor& weight) { - Tensor x_cast = x; - Tensor y_cast = y; - Tensor weight_cast = weight; - bool need_cast = false; - if (is_half_dtype(x.dtype())) { - need_cast = true; - x_cast = cast(x, DataType::FLOAT32); - } - if (is_half_dtype(y.dtype())) { - need_cast = true; - y_cast = cast(y, DataType::FLOAT32); - } - if (is_half_dtype(weight.dtype())) { - need_cast = true; - weight_cast = cast(weight, DataType::FLOAT32); - } + Tensor x_cast = ConverToMT(x); + Tensor y_cast = ConverToMT(y); + Tensor weight_cast = ConverToMT(weight); Tensor res = x_cast + weight_cast * (y_cast - x_cast); - if (need_cast) { - return cast(res, x.dtype()); - } else { - return res; - } + return ConverToOrig(res, x.dtype()); } template diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 87a241ab69ecb..091d780d488db 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -45,25 +45,6 @@ void assign_grad(const Tensor& out_grad, Tensor* x_grad) { } } -template -Tensor ConverToMT(const Tensor& x) { - bool need_cast = x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16; - if (need_cast) { - return cast(x, phi::DataType::FLOAT32); - } - return x; -} - -template -Tensor ConverToOrig(const Tensor& out, phi::DataType input_dtype) { - bool need_cast = out.dtype() != input_dtype; - if (need_cast) { - return cast(out, input_dtype); - } - return out; -} - template void bce_loss_grad(const Tensor& input, const Tensor& label, @@ -349,82 +330,46 @@ void gelu_grad(const Tensor& x, bool approximate, Tensor* x_grad) { if (!x_grad) return; - // 
Promote to fp32 when the input type is fp16 for keeping consistent with - // phi kernel - - if (is_half_dtype(x.dtype())) { - auto promoted_x = cast(x, phi::DataType::FLOAT32); - auto promoted_out_grad = cast(out_grad, phi::DataType::FLOAT32); - if (approximate) { - float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5; - float kkappa = 0.044715; - Tensor kbeta_ = full_scalar(kbeta, promoted_x.dtype()); - Tensor kkappa_ = full_scalar(kkappa, promoted_x.dtype()); - - auto x_sq = promoted_x * promoted_x; - auto x_cube = x_sq * promoted_x; - auto inner = kbeta_ * (promoted_x + kkappa_ * x_cube); - auto tanh_inner = tanh(inner); - - auto left = scale(promoted_x, 0.5); - auto right = scale(tanh_inner, 1., 1.); - - auto left_derivative = scale(right, 0.5); - - auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); - auto inner_derivative = kbeta_ * (scale(3 * kkappa_ * x_sq, 1., 1.)); - auto right_derivative = left * tanh_derivative * inner_derivative; - - set_output( - cast(promoted_out_grad * (left_derivative + right_derivative), - x.type()), - x_grad); - } else { - float kalpha = M_SQRT1_2; - float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; - Tensor kalpha_ = full_scalar(kalpha, promoted_x.dtype()); - Tensor kbeta_ = full_scalar(kbeta, promoted_x.dtype()); - - auto cdf = scale(scale(erf(kalpha_ * promoted_x), 1., 1.), 0.5); - auto pdf = kbeta_ * exp(scale(promoted_x * promoted_x, -0.5)); - set_output( - cast(promoted_out_grad * (cdf + promoted_x * pdf), x.type()), - x_grad); - } + // Automatically promote to fp32 when the input type is fp16 for keeping + // consistent with phi kernel + + auto promoted_x = ConverToMT(x); + auto promoted_out_grad = ConverToMT(out_grad); + if (approximate) { + float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5; + float kkappa = 0.044715; + Tensor kbeta_ = full_scalar(kbeta, promoted_x.dtype()); + Tensor kkappa_ = full_scalar(kkappa, promoted_x.dtype()); + + auto x_sq = promoted_x * promoted_x; + auto x_cube = x_sq * promoted_x; + auto inner = kbeta_ * (promoted_x + kkappa_ * x_cube); + auto tanh_inner = tanh(inner); + + auto left = scale(promoted_x, 0.5); + auto right = scale(tanh_inner, 1., 1.); + + auto left_derivative = scale(right, 0.5); + + auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); + auto inner_derivative = kbeta_ * (scale(3 * kkappa_ * x_sq, 1., 1.)); + auto right_derivative = left * tanh_derivative * inner_derivative; + + set_output( + ConverToOrig( + promoted_out_grad * (left_derivative + right_derivative), x.type()), + x_grad); } else { - // Scale only support fp32 attr in static graph mode, use elementwise_xx - // when precision is over fp32. 
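

The rewritten gelu_grad above folds the two precision branches into a single path by leaning on the ConverToMT/ConverToOrig helper pair that this patch moves into paddle/fluid/primitive/utils/utils.h. As a rough Python-level sketch of what that pair does (the names _to_mt and _to_orig are illustrative, not part of the patch):

    import paddle

    def _to_mt(x):
        # Promote half-precision inputs to fp32 before computing, mirroring
        # ConverToMT (fp16/bf16 -> fp32, identity for everything else).
        if x.dtype in (paddle.float16, paddle.bfloat16):
            return paddle.cast(x, paddle.float32)
        return x

    def _to_orig(out, orig_dtype):
        # Cast back only if the dtype actually changed, mirroring ConverToOrig.
        return paddle.cast(out, orig_dtype) if out.dtype != orig_dtype else out
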
- if (approximate) { - auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; - auto kKappa = 0.044715; - Tensor kBeta_ = full_scalar(kBeta, x.dtype()); - Tensor kKappa_ = full_scalar(kKappa, x.dtype()); + float kalpha = M_SQRT1_2; + float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + Tensor kalpha_ = full_scalar(kalpha, promoted_x.dtype()); + Tensor kbeta_ = full_scalar(kbeta, promoted_x.dtype()); - auto x_sq = x * x; - auto x_cube = x_sq * x; - auto inner = kBeta_ * (x + kKappa_ * x_cube); - auto tanh_inner = tanh(inner); - - auto left = scale(x, 0.5); - auto right = scale(tanh_inner, 1., 1.); - - auto left_derivative = scale(right, 0.5); - - auto tanh_derivative = scale(tanh_inner * tanh_inner, -1., 1.); - auto inner_derivative = kBeta_ * (scale(3 * kKappa_ * x_sq, 1., 1.)); - auto right_derivative = left * tanh_derivative * inner_derivative; - - set_output(out_grad * (left_derivative + right_derivative), x_grad); - } else { - auto kAlpha = M_SQRT1_2; - auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; - Tensor kAlpha_ = full_scalar(kAlpha, x.dtype()); - Tensor kBeta_ = full_scalar(kBeta, x.dtype()); - - auto cdf = scale(scale(erf(kAlpha_ * x), 1., 1.), 0.5); - auto pdf = kBeta_ * exp(scale(x * x, -0.5)); - set_output(out_grad * (cdf + x * pdf), x_grad); - } + auto cdf = scale(scale(erf(kalpha_ * promoted_x), 1., 1.), 0.5); + auto pdf = kbeta_ * exp(scale(promoted_x * promoted_x, -0.5)); + set_output( + ConverToOrig(promoted_out_grad * (cdf + promoted_x * pdf), x.type()), + x_grad); } } @@ -1400,12 +1345,8 @@ void masked_select_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto promoted_x = x; - auto promoted_out_grad = out_grad; - if (is_half_dtype(x.dtype())) { - promoted_x = cast(x, DataType::FLOAT32); - promoted_out_grad = cast(out_grad, DataType::FLOAT32); - } + auto promoted_x = ConverToMT(x); + auto promoted_out_grad = ConverToMT(out_grad); auto x_num = 1; for (size_t i = 0; i < promoted_x.shape().size(); i++) { @@ -2853,16 +2794,10 @@ void logcumsumexp_grad(const Tensor& x, if (x_grad) { reverse = !reverse; Tensor tmp, lowest, x_grad_tmp; - Tensor x_cast = x; - Tensor out_cast = out; - Tensor out_grad_cast = out_grad; - bool need_cast = is_half_dtype(x.dtype()); + Tensor x_cast = ConverToMT(x); + Tensor out_cast = ConverToMT(out); + Tensor out_grad_cast = ConverToMT(out_grad); - if (need_cast) { - x_cast = cast(x, DataType::FLOAT32); - out_cast = cast(out, DataType::FLOAT32); - out_grad_cast = cast(out_grad, DataType::FLOAT32); - } const Tensor out_grad_log = log(abs(out_grad_cast)); auto out_grad_dtype = out_grad_cast.dtype(); @@ -2931,11 +2866,7 @@ void logcumsumexp_grad(const Tensor& x, x_grad_tmp = reshape(out_grad_pos - out_grad_neg, x_cast.shape()); } - if (need_cast) { - set_output(cast(x_grad_tmp, x.dtype()), x_grad); - } else { - set_output(x_grad_tmp, x_grad); - } + set_output(ConverToOrig(x_grad_tmp, x.dtype()), x_grad); } } diff --git a/paddle/fluid/primitive/utils/utils.h b/paddle/fluid/primitive/utils/utils.h index bb205a9b777b6..a122d2c43c630 100644 --- a/paddle/fluid/primitive/utils/utils.h +++ b/paddle/fluid/primitive/utils/utils.h @@ -283,5 +283,25 @@ static bool has_dynamic_shape(const std::vector& shape, return flag; } +template +Tensor ConverToMT(const Tensor& x) { + bool need_cast = x.dtype() == phi::DataType::FLOAT16 || + x.dtype() == phi::DataType::BFLOAT16 || + x.dtype() == phi::DataType::UINT16; + if (need_cast) { + return cast(x, phi::DataType::FLOAT32); + } + return x; +} + +template +Tensor ConverToOrig(const Tensor& out, phi::DataType 
input_dtype) { + bool need_cast = out.dtype() != input_dtype; + if (need_cast) { + return cast(out, input_dtype); + } + return out; +} + } // namespace primitive } // namespace paddle From e549437ad8bda6ff55ed22039eb09bafa74f78ba Mon Sep 17 00:00:00 2001 From: Chang Lu <55493212+AndSonder@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:53:30 +0800 Subject: [PATCH 023/135] temp fix (#68533) --- .../distributed/auto_parallel/static/mix_to_dist_pass.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/paddle/distributed/auto_parallel/static/mix_to_dist_pass.py b/python/paddle/distributed/auto_parallel/static/mix_to_dist_pass.py index e3b55684fd890..e4fab9c6556e8 100644 --- a/python/paddle/distributed/auto_parallel/static/mix_to_dist_pass.py +++ b/python/paddle/distributed/auto_parallel/static/mix_to_dist_pass.py @@ -34,6 +34,12 @@ def verify_dist_block(block): if op.name() == "dist_op.shard_tensor": raise RuntimeError("Block still contain shard_tensor_op.") if op.dist_attr is None: + # Note (luchang): Temp fix, remove unused parameter 'op'. + # Will be removed in the future. + if op.name() == "builtin.parameter": + if op.result(0).use_empty(): + op.erase() + continue raise RuntimeError( f"The op {op} does not have OperatorDistAttr after Mix2Dist Pass." ) From 438e66b9046c564bfaff92dec7e7a7c4bffbf18a Mon Sep 17 00:00:00 2001 From: skywalker2012 <108259496+skywalker2012@users.noreply.github.com> Date: Tue, 1 Oct 2024 08:53:29 +0800 Subject: [PATCH 024/135] [XPU] support argmin bf16 (#68453) * support argmin bf16 * convert_float_to_uint16 * pre-commit modify --- paddle/phi/backends/xpu/xpu3_op_list.cc | 4 +++- paddle/phi/kernels/xpu/arg_min_max_kernel.cc | 9 +++++++-- test/xpu/test_arg_min_op_xpu.py | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index c799be825fc9e..5c442fde21c51 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -49,7 +49,9 @@ XPUOpMap& get_kl3_ops() { phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"arg_min", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"argsort_grad", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64, diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index 693e0ba8070ed..3152116a49a77 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -196,7 +196,12 @@ PD_REGISTER_KERNEL(argmax, kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } -PD_REGISTER_KERNEL( - argmin, XPU, ALL_LAYOUT, phi::ArgMinKernel, float, phi::dtype::float16) { +PD_REGISTER_KERNEL(argmin, + XPU, + ALL_LAYOUT, + phi::ArgMinKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/test/xpu/test_arg_min_op_xpu.py b/test/xpu/test_arg_min_op_xpu.py index b1dcb2e93dcc4..5d1de55037c6b 100644 --- a/test/xpu/test_arg_min_op_xpu.py +++ b/test/xpu/test_arg_min_op_xpu.py @@ -20,6 +20,7 @@ create_test_class, get_xpu_op_support_types, ) +from op_test import convert_float_to_uint16 from op_test_xpu import XPUOpTest import paddle @@ -41,8 +42,17 @@ def setUp(self): self.dtype = self.in_type self.initTestCase() - self.x = (np.random.random(self.dims)).astype(self.dtype) - self.inputs = {'X': self.x} 
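

The replacement lines below follow the usual bf16 testing convention: generate the data in fp32, and only convert it to the uint16 bit pattern that represents bf16 when the type under test is np.uint16. A standalone sketch of that convention (shape and values are illustrative):

    import numpy as np
    from op_test import convert_float_to_uint16  # same helper imported above

    x_fp32 = np.random.random([3, 4]).astype("float32")
    x_bf16 = convert_float_to_uint16(x_fp32)  # bf16 stored as uint16 bits
    assert x_bf16.dtype == np.uint16
    # The reference result is still computed on the fp32 data, so
    # self.outputs keeps np.argmin(self.x, axis=self.axis) unchanged.
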
+ self.x = (np.random.random(self.dims)).astype( + self.dtype if self.dtype != np.uint16 else np.float32 + ) + + self.inputs = { + 'X': ( + self.x + if self.dtype != np.uint16 + else convert_float_to_uint16(self.x) + ) + } self.attrs = {'axis': self.axis, 'use_xpu': True} self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} From 554f472c7b9abd25660aaaface7beedcd930aeb3 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 1 Oct 2024 18:34:04 +0800 Subject: [PATCH 025/135] [DLPACK] Setting device_id in FromDLPACK (#68549) --- paddle/fluid/framework/tensor_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 111d3e96f686c..48599fb4e18e7 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -908,7 +908,7 @@ phi::DenseTensor TensorFromDLPack(DLManagedTensor* src, Deleter deleter) { if (src->dl_tensor.device.device_type == kDLCPU) { place = phi::CPUPlace(); } else if (src->dl_tensor.device.device_type == kDLCUDA) { - place = phi::GPUPlace(); + place = phi::GPUPlace(src->dl_tensor.device.device_id); } else if (src->dl_tensor.device.device_type == kDLCUDAHost) { place = phi::GPUPinnedPlace(); } else { From 21ec722e28ecc52899e6cae840e3b9b6c4b61f74 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 8 Oct 2024 10:40:31 +0800 Subject: [PATCH 026/135] Support builtin `__abs__` method for Tensor/Variable/Value (#68511) --- python/paddle/base/dygraph/math_op_patch.py | 4 ++ python/paddle/base/layers/math_op_patch.py | 4 ++ python/paddle/optimizer/lbfgs.py | 13 ++---- python/paddle/pir/math_op_patch.py | 4 ++ test/legacy_test/test_math_op_patch.py | 34 +++++++++++++++ test/legacy_test/test_math_op_patch_pir.py | 42 +++++++++++++++++++ .../test_math_op_patch_var_base.py | 20 +++++++++ 7 files changed, 112 insertions(+), 9 deletions(-) diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py index 467b58e0e583f..24d593238c5ec 100644 --- a/python/paddle/base/dygraph/math_op_patch.py +++ b/python/paddle/base/dygraph/math_op_patch.py @@ -107,6 +107,9 @@ def _scalar_elementwise_op_( def _neg_(var: Tensor) -> Tensor: return _scalar_elementwise_op_(var, -1.0, 0.0) + def _abs_(var: Tensor) -> Tensor: + return var.abs() + def _float_(var: Tensor) -> float: numel = np.prod(var.shape) assert ( @@ -188,6 +191,7 @@ def _T_(var: Tensor) -> Tensor: eager_methods = [ ('__neg__', _neg_), + ('__abs__', _abs_), ('__float__', _float_), ('__long__', _long_), ('__int__', _int_), diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index cd58b8cf9597b..a24356a3bcf54 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -482,6 +482,9 @@ def _scalar_op_(var, scale, bias): def _neg_(var): return _scalar_op_(var, -1.0, 0.0) + def _abs_(var): + return paddle.abs(var) + @property def _ndim(self): """ @@ -778,6 +781,7 @@ def to_dense(var): variable_methods = [ # b=-a ('__neg__', _neg_), + ('__abs__', _abs_), ('astype', astype), ('cpu', cpu), ('cuda', cuda), diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py index 3aa0c2c4cd079..7e29209787577 100644 --- a/python/paddle/optimizer/lbfgs.py +++ b/python/paddle/optimizer/lbfgs.py @@ -209,7 +209,7 @@ def _strong_wolfe( bracket_gtd = [gtd_prev, gtd_new] break - if paddle.abs(gtd_new) <= -c2 * gtd: + if abs(gtd_new) <= 
-c2 * gtd: bracket = [alpha] bracket_f = [loss_new] bracket_g = [grad_new] @@ -262,10 +262,7 @@ def _strong_wolfe( low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) while not done and ls_iter < max_ls: # line-search bracket is so small - bracket_ls = bracket[1] - bracket[0] - if not isinstance(bracket_ls, paddle.Tensor): - bracket_ls = paddle.to_tensor(bracket_ls, dtype=gtd_new.dtype) - if paddle.abs(bracket_ls) * d_norm < tolerance_change: + if abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: break # compute new trial value @@ -291,9 +288,7 @@ def _strong_wolfe( # interpolation close to boundary if insuf_progress or alpha >= max(bracket) or alpha <= min(bracket): # evaluate at 0.1 away from boundary - if paddle.abs(alpha - max(bracket)) < paddle.abs( - alpha - min(bracket) - ): + if abs(alpha - max(bracket)) < abs(alpha - min(bracket)): alpha = max(bracket) - eps else: alpha = min(bracket) + eps @@ -321,7 +316,7 @@ def _strong_wolfe( (0, 1) if bracket_f[0] <= bracket_f[1] else (1, 0) ) else: - if paddle.abs(gtd_new) <= -c2 * gtd: + if abs(gtd_new) <= -c2 * gtd: # Wolfe conditions satisfied done = True elif gtd_new * (bracket[high_pos] - bracket[low_pos]) >= 0: diff --git a/python/paddle/pir/math_op_patch.py b/python/paddle/pir/math_op_patch.py index 8d43e53c95239..62d0fb21a9e33 100644 --- a/python/paddle/pir/math_op_patch.py +++ b/python/paddle/pir/math_op_patch.py @@ -334,6 +334,9 @@ def _scalar_div_(var, value): def _scalar_neg_(var): return paddle.scale(var, -1.0, 0.0) + def _scalar_abs_(var): + return paddle.abs(var) + def _binary_creator_( method_name, python_api, @@ -1063,6 +1066,7 @@ def register_hook(self, hook): _binary_creator_('__matmul__', paddle.tensor.matmul, False, None), ), ('__neg__', _scalar_neg_), + ('__abs__', _scalar_abs_), # For compare operators ( '__eq__', diff --git a/test/legacy_test/test_math_op_patch.py b/test/legacy_test/test_math_op_patch.py index 125c2219693e2..ec74e06acb2d5 100644 --- a/test/legacy_test/test_math_op_patch.py +++ b/test/legacy_test/test_math_op_patch.py @@ -245,6 +245,40 @@ def test_neg(self): ) np.testing.assert_allclose(-a_np, b_np, rtol=1e-05) + @prog_scope() + def test_abs(self): + # test for real number + a = paddle.static.data(name="a", shape=[-1, 10, 1], dtype='float32') + if not paddle.framework.use_pir_api(): + a.desc.set_need_check_feed(False) + b = abs(a) # call __abs__ + place = base.CPUPlace() + exe = base.Executor(place) + a_np = np.random.uniform(-1, 1, size=[10, 1]).astype('float32') + + (b_np,) = exe.run( + base.default_main_program(), feed={"a": a_np}, fetch_list=[b] + ) + np.testing.assert_allclose(np.abs(a_np), b_np, rtol=1e-05) + + @prog_scope() + def test_abs_complex(self): + # test for complex number + a = paddle.static.data(name="a", shape=[-1, 10, 1], dtype='complex64') + if not paddle.framework.use_pir_api(): + a.desc.set_need_check_feed(False) + b = abs(a) # call __abs__ + place = base.CPUPlace() + exe = base.Executor(place) + a_np = np.random.uniform(-1, 1, size=[10, 1]).astype( + 'float32' + ) + 1j * np.random.uniform(-1, 1, size=[10, 1]).astype('float32') + + (b_np,) = exe.run( + base.default_main_program(), feed={"a": a_np}, fetch_list=[b] + ) + np.testing.assert_allclose(np.abs(a_np), b_np, rtol=1e-05) + @prog_scope() def test_astype(self): a = paddle.static.data(name="a", shape=[-1, 10, 1]) diff --git a/test/legacy_test/test_math_op_patch_pir.py b/test/legacy_test/test_math_op_patch_pir.py index a5e1cafeb11a8..c2e40b19ab09d 100644 --- a/test/legacy_test/test_math_op_patch_pir.py 
+++ b/test/legacy_test/test_math_op_patch_pir.py @@ -554,6 +554,48 @@ def test_neg(self): np.testing.assert_array_equal(res, a_np) np.testing.assert_array_equal(res, b_np) + def test_abs(self): + # test for real number + x_np = np.random.uniform(-1, 1, [10, 1024]).astype(np.float32) + res = abs(x_np) + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x = paddle.static.data( + name='x', shape=[10, 1024], dtype="float32" + ) + a = abs(x) + b = x.__abs__() + (a_np, b_np) = exe.run( + main_program, + feed={"x": x_np}, + fetch_list=[a, b], + ) + np.testing.assert_array_equal(res, a_np) + np.testing.assert_array_equal(res, b_np) + + def test_abs_complex(self): + # test for complex number + x_np = np.random.uniform(-1, 1, [10, 1024]).astype( + np.float32 + ) + 1j * np.random.uniform(-1, 1, [10, 1024]).astype(np.float32) + res = abs(x_np) + with paddle.pir_utils.IrGuard(): + main_program, exe, program_guard = new_program() + with program_guard: + x = paddle.static.data( + name='x', shape=[10, 1024], dtype="complex64" + ) + a = abs(x) + b = x.__abs__() + (a_np, b_np) = exe.run( + main_program, + feed={"x": x_np}, + fetch_list=[a, b], + ) + np.testing.assert_allclose(res, a_np, rtol=2e-7, atol=0.0) + np.testing.assert_allclose(res, b_np, rtol=2e-7, atol=0.0) + def test_builtin_type_conversion(self): with paddle.pir_utils.IrGuard(): _, _, program_guard = new_program() diff --git a/test/legacy_test/test_math_op_patch_var_base.py b/test/legacy_test/test_math_op_patch_var_base.py index 56df04664bc35..46645451f8ede 100644 --- a/test/legacy_test/test_math_op_patch_var_base.py +++ b/test/legacy_test/test_math_op_patch_var_base.py @@ -479,6 +479,26 @@ def test_neg(self): res = -a np.testing.assert_array_equal(res.numpy(), -a_np) + def test_abs(self): + # test for real number + a_np = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + with base.dygraph.guard(): + a = paddle.to_tensor(a_np) + res = abs(a) + np.testing.assert_array_equal(res.numpy(), np.abs(a_np)) + + def test_abs_complex(self): + # test for complex number + a_np = np.random.uniform(-1, 1, self.shape).astype( + self.dtype + ) + 1j * np.random.uniform(-1, 1, self.shape).astype(self.dtype) + with base.dygraph.guard(): + a = paddle.to_tensor(a_np) + res = abs(a) + np.testing.assert_allclose( + res.numpy(), np.abs(a_np), rtol=2e-7, atol=0.0 + ) + def test_float_int_long(self): with base.dygraph.guard(): a = paddle.to_tensor(np.array([100.1])) From b9db293106c9100e1ea00ee385c0b84f96d3a55f Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 8 Oct 2024 13:22:21 +0800 Subject: [PATCH 027/135] [XPU] disable paddlex (#68561) --- paddle/scripts/paddle_build.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8078d769ba38e..fd7bd776a8edd 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2504,13 +2504,13 @@ set +x IFS=',' read -ra DEVICES <<< "$CUDA_VISIBLE_DEVICES" echo ${DEVICES[0]} - echo "Starting to predict ResNet50 model..." - python main.py -c paddlex/configs/image_classification/ResNet50.yaml \ - -o Global.mode=predict \ - -o Predict.model_dir="./resnet50_output/best_model" \ - -o Predict.input_path="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_image_classification_001.jpg" \ - -o Global.device="xpu:${DEVICES[0]}" - echo "Predicting Resnet50 completed!" 
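

A note on the __abs__ support added in PATCH 026 above: registering the method on the dygraph Tensor, the static-graph Variable, and the PIR Value lets Python's builtin abs() dispatch uniformly, which is what allows lbfgs.py to drop its explicit paddle.abs calls. A short usage sketch (values are illustrative):

    import paddle

    x = paddle.to_tensor([-1.5, 2.0, -3.0])
    print(abs(x))   # builtin abs() now routes to Tensor.__abs__

    z = paddle.to_tensor([3.0 + 4.0j])
    print(abs(z))   # for complex input, returns the modulus (5.0)
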
+        #echo "Starting to predict ResNet50 model..."
+        #python main.py -c paddlex/configs/image_classification/ResNet50.yaml \
+        #    -o Global.mode=predict \
+        #    -o Predict.model_dir="./resnet50_output/best_model" \
+        #    -o Predict.input_path="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_image_classification_001.jpg" \
+        #    -o Global.device="xpu:${DEVICES[0]}"
+        #echo "Predicting Resnet50 completed!"
         cd ..
         export FLAGS_enable_pir_api=1
     fi

From 13ad9e9fb759dd76519447f927a9e9c6cd9cdeda Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Tue, 8 Oct 2024 14:47:56 +0800
Subject: [PATCH 028/135] [Inference] Add slice converter (#68482)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* split converter

* forgot to commit a unit test

* checkpoint the code

* add concat converter

* delete unittest

* rerun ci

* fix bugs

---------

Co-authored-by: lizexu <2694294196@qq.com>
---
 .../transforms/tensorrt/trt_op_marker_pass.cc |  43 +++++-
 python/paddle/tensorrt/converter.py           | 144 +++++++++++++-----
 python/paddle/tensorrt/converter_utils.py     |  38 +++++
 python/paddle/tensorrt/impls/manipulation.py  | 141 +++++++++++++++++
 test/tensorrt/tensorrt_test_base.py           |  32 +++-
 test/tensorrt/test_converter_manipulation.py  |  58 +++++++
 test/tensorrt/test_trt_marker_slice.py        |  65 --------
 7 files changed, 407 insertions(+), 114 deletions(-)
 delete mode 100644 test/tensorrt/test_trt_marker_slice.py

diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
index 0cbef3ac80301..03ffe54971d44 100644
--- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
+++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -637,21 +637,48 @@ class SliceOpPattern : public pir::OpRewritePattern<paddle::dialect::SliceOp> {
     }
 
     auto axes_attr = op->attribute<pir::ArrayAttribute>("axes");
-
     std::vector<int64_t> axes;
     for (const auto &attr : axes_attr.AsVector()) {
       axes.push_back(attr.dyn_cast<pir::Int64Attribute>().data());
     }
-    pir::Value input = op.operand_source(0);
-    auto inputs = input.type().dyn_cast<paddle::dialect::DenseTensorType>();
-    auto inputs_shape = inputs.dims();
-    if (axes.size() !=
-        static_cast<std::vector<int64_t>::size_type>(inputs_shape.size())) {
-      VLOG(3) << "The shape of attributes of the slice operator axes "
-                 "and starts are not equal.";
+    size_t starts_size = axes.size();
+    size_t ends_size = axes.size();
+    if (pir::GetDefiningOpForInput(op, 1)
+            ->isa<paddle::dialect::FullIntArrayOp>()) {
+      paddle::dialect::FullIntArrayOp full_int_array_op_start =
+          pir::GetDefiningOpForInput(op, 1)
+              ->dyn_cast<paddle::dialect::FullIntArrayOp>();
+      auto starts_attr =
+          full_int_array_op_start->attribute<pir::ArrayAttribute>("value");
+      std::vector<int64_t> starts;
+      for (const auto &attr : starts_attr.AsVector()) {
+        starts.push_back(attr.dyn_cast<pir::Int64Attribute>().data());
+      }
+      starts_size = starts.size();
+    }
+
+    if (pir::GetDefiningOpForInput(op, 2)
+            ->isa<paddle::dialect::FullIntArrayOp>()) {
+      paddle::dialect::FullIntArrayOp full_int_array_op_end =
+          pir::GetDefiningOpForInput(op, 2)
+              ->dyn_cast<paddle::dialect::FullIntArrayOp>();
+      auto ends_attr =
+          full_int_array_op_end->attribute<pir::ArrayAttribute>("value");
+      std::vector<int64_t> ends;
+      for (const auto &attr : ends_attr.AsVector()) {
+        ends.push_back(attr.dyn_cast<pir::Int64Attribute>().data());
+      }
+      ends_size = ends.size();
+    }
+    if (starts_size != axes.size() || ends_size != axes.size()) {
+      VLOG(3) << "The size of axes and starts are not equal. 
" + "Axes size: " + << axes.size() << ", Starts size: " << starts_size + << ", Ends size: " << ends_size; return false; } + op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true)); return true; } diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 386390bf7a6a1..27dfc8f9a357f 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -80,7 +80,7 @@ def __init__(self, paddle_program, scope): # weights = trt.Weights(weight_array) param_dict.update({name: weight_array}) self.param_dict = param_dict - self.shape_map = {} + self.input_info = {} self.trt_output_value_map = {} def find_graph_inputs_outputs(self, group_op): @@ -135,6 +135,9 @@ def convert_subgraph_to_trt(self, program, group_op): min_shape_map = {} opt_shape_map = {} max_shape_map = {} + min_value_map = {} + opt_value_map = {} + max_value_map = {} input_names = [] # Because one of the inputs to pd_op.concat is builtin.combine, @@ -150,6 +153,7 @@ def convert_subgraph_to_trt(self, program, group_op): else: origin_input_value.append(value) + # create TRT Weight and TRT Input for value in origin_input_value: defining_op = value.get_defining_op() if defining_op.name() == "builtin.parameter": @@ -160,39 +164,10 @@ def convert_subgraph_to_trt(self, program, group_op): else: shape = value.shape dtype = map_dtype(value.dtype.name) - _logger.info( - f"set shape of {value}, op is: {value.get_defining_op()}" - ) - if value.get_defining_op().name() == "builtin.split": - # TODO if the input value is generated by the other trt_engine_op, so the shape is searched by origin value - min_shape = self.shape_map[value.id]["min_shape"] - opt_shape = self.shape_map[value.id]["opt_shape"] - max_shape = self.shape_map[value.id]["max_shape"] - else: - min_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kMIN - ) - opt_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kOPT - ) - max_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kMAX - ) - input_name = f"input_{value.id}" - input_tensor = network.add_input( name=input_name, dtype=dtype, shape=shape ) - _logger.info(f"set min_shape of {value} as {min_shape}") - _logger.info(f"set opt_shape of {value} as {opt_shape}") - _logger.info(f"set max_shape of {value} as {max_shape}") - profile.set_shape( - input_name, min=min_shape, opt=opt_shape, max=max_shape - ) - min_shape_map[input_name] = min_shape - opt_shape_map[input_name] = opt_shape - max_shape_map[input_name] = max_shape input_names.append(input_name) value_to_trt_tensor[value.id] = input_tensor @@ -232,6 +207,79 @@ def convert_subgraph_to_trt(self, program, group_op): value_to_trt_tensor[result.id] = trt_outs[idx] else: value_to_trt_tensor[result.id] = None + + # Set TRT min/opt/max input shape and the value of shape tensor + for value in origin_input_value: + trt_input = value_to_trt_tensor[value.id] + if isinstance(trt_input, trt.Weights): + continue + input_name = trt_input.name + if input_name != "": + _logger.info( + f"set shape of {value}, op is: {value.get_defining_op()}" + ) + min_shape = [] + opt_shape = [] + max_shape = [] + min_value = [] + opt_value = [] + max_value = [] + if value.get_defining_op().name() == "builtin.split": + # TODO if the input value is generated by the other trt_engine_op, so the shape is searched by origin value + min_shape = self.input_info[value.id]["min_shape"] + opt_shape = self.input_info[value.id]["opt_shape"] + max_shape = 
self.input_info[value.id]["max_shape"] + if trt_input.is_shape_tensor: + min_value = self.input_info[value.id]["min_value"] + opt_value = self.input_info[value.id]["opt_value"] + max_value = self.input_info[value.id]["max_value"] + else: + min_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kMIN + ) + opt_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kOPT + ) + max_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kMAX + ) + if trt_input.is_shape_tensor: + min_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kMIN + ) + opt_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kOPT + ) + max_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kMAX + ) + _logger.info(f"set min_shape of {value} as {min_shape}") + _logger.info(f"set opt_shape of {value} as {opt_shape}") + _logger.info(f"set max_shape of {value} as {max_shape}") + profile.set_shape( + input_name, min=min_shape, opt=opt_shape, max=max_shape + ) + if trt_input.is_shape_tensor: + _logger.info( + f"set min_value of shape input: {value} as {min_value}" + ) + _logger.info( + f"set max_value of shape input: {value} as {opt_value}" + ) + _logger.info( + f"set opt_value of shape input: {value} as {max_value}" + ) + profile.set_shape_input( + input_name, min=min_value, opt=opt_value, max=max_value + ) + + min_shape_map[input_name] = min_shape + opt_shape_map[input_name] = opt_shape + max_shape_map[input_name] = max_shape + min_value_map[input_name] = min_value + opt_value_map[input_name] = opt_value + max_value_map[input_name] = max_value + out_shapes = [] out_names = [] out_types = [] @@ -259,10 +307,27 @@ def convert_subgraph_to_trt(self, program, group_op): max_shape = get_value_shape_range_info( result_value, False, paddle.base.core.ShapeMode.kMAX ) - self.shape_map[result_value.id] = { + min_value = [] + opt_value = [] + max_value = [] + if output_tensor.is_shape_tensor: + min_value = get_value_shape_range_info( + result_value, True, paddle.base.core.ShapeMode.kMIN + ) + opt_value = get_value_shape_range_info( + result_value, True, paddle.base.core.ShapeMode.kOPT + ) + max_value = get_value_shape_range_info( + result_value, True, paddle.base.core.ShapeMode.kMAX + ) + + self.input_info[result_value.id] = { "min_shape": min_shape, "opt_shape": opt_shape, "max_shape": max_shape, + "min_value": min_value, + "opt_value": opt_value, + "max_value": max_value, } config = builder.create_builder_config() @@ -277,6 +342,9 @@ def convert_subgraph_to_trt(self, program, group_op): trt_params.min_input_shape = min_shape_map trt_params.max_input_shape = max_shape_map trt_params.optim_input_shape = opt_shape_map + trt_params.min_shape_tensor = min_value_map + trt_params.max_shape_tensor = max_value_map + trt_params.optim_shape_tensor = opt_value_map group_str = str(group_op) engine_name = ( int(hashlib.sha256(group_str.encode('utf-8')).hexdigest(), 16) @@ -309,13 +377,19 @@ def convert_subgraph_to_trt(self, program, group_op): continue ori_value = output_values[out_index] current_value = out[out_index] - orin_min_shape = self.shape_map[ori_value.id]["min_shape"] - orin_opt_shape = self.shape_map[ori_value.id]["opt_shape"] - orin_max_shape = self.shape_map[ori_value.id]["max_shape"] - self.shape_map[current_value.id] = { + orin_min_shape = self.input_info[ori_value.id]["min_shape"] + orin_opt_shape = self.input_info[ori_value.id]["opt_shape"] + orin_max_shape = 
self.input_info[ori_value.id]["max_shape"] + orin_min_value = self.input_info[ori_value.id]["min_value"] + orin_opt_value = self.input_info[ori_value.id]["opt_value"] + orin_max_value = self.input_info[ori_value.id]["max_value"] + self.input_info[current_value.id] = { "min_shape": orin_min_shape, "opt_shape": orin_opt_shape, "max_shape": orin_max_shape, + "min_value": orin_min_value, + "opt_value": orin_opt_value, + "max_value": orin_max_value, } return out diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py index 812c184f45b68..edcea52ad4bfe 100644 --- a/python/paddle/tensorrt/converter_utils.py +++ b/python/paddle/tensorrt/converter_utils.py @@ -155,3 +155,41 @@ def add_elementwise_layer(network, paddle_op, inputs, op_type): ) layer = network.add_elementwise(lhs_val, rhs_val, op_type) return layer.get_output(0) + + +# Create and add 1D constant layer +def add_1D_constant_layer(network, data, dtype=np.int32): + constant_data = np.array([data], dtype=dtype) + constant_layer = network.add_constant(constant_data.shape, constant_data) + return constant_layer.get_output(0) + + +# Get element tensor of 1D shape tensor +def get_shape_tensor_element(network, x, index): + assert index >= 0, ( + "The index should be greater or equal than 0, but got %d" % index + ) + gather_layer = network.add_gather( + input=x, indices=add_1D_constant_layer(network, index), axis=0 + ) + return gather_layer.get_output(0) + + +def trt_sum(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.SUM) + return layer.get_output(0) + + +def trt_max(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.MAX) + return layer.get_output(0) + + +def trt_sub(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.SUB) + return layer.get_output(0) + + +def trt_min(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.MIN) + return layer.get_output(0) diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 4d49aef7939e3..556bd48e8549c 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -13,12 +13,19 @@ # limitations under the License. 
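

The additions below give python/paddle/tensorrt/impls/manipulation.py a slice converter built on the shape-tensor helpers just added to converter_utils.py. Every converter in this file hangs off the same decorator; a minimal hypothetical converter showing only the registration shape (pd_op.relu is chosen purely for illustration and is not part of this patch):

    import tensorrt as trt

    from paddle.tensorrt.register import converter_registry

    @converter_registry.register("pd_op.relu", trt_version="8.x")
    def relu_converter(network, paddle_op, inputs):
        # inputs[0] is the ITensor built for the op's first operand.
        layer = network.add_activation(inputs[0], trt.ActivationType.RELU)
        return layer.get_output(0)
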
+import numpy as np import tensorrt as trt from paddle.tensorrt.converter_utils import ( + add_1D_constant_layer, get_axes_for_reduce_op, get_positive_dim, + get_shape_tensor_element, has_dynamic_shape, + trt_max, + trt_min, + trt_sub, + trt_sum, ) from paddle.tensorrt.register import converter_registry @@ -205,3 +212,137 @@ def squeeze_converter(network, paddle_op, inputs): layer = network.add_shuffle(input_val) layer.reshape_dims = tuple(output_shape) return layer.get_output(0) + + +@converter_registry.register("pd_op.slice", trt_version="8.x") +def slice_converter(network, paddle_op, inputs): + input_tensor = inputs[0] + input_shape = paddle_op.operands()[0].source().shape + axes = paddle_op.attrs()["axes"] + decrease_axis = paddle_op.attrs().get("decrease_axis") + + starts_op = paddle_op.operands()[1].source().get_defining_op() + ends_op = paddle_op.operands()[2].source().get_defining_op() + input_shape_tensor = network.add_shape(input_tensor).get_output(0) + input_rank = len(input_tensor.shape) + + starts_tensor = [] + ends_tensor = [] + for i in range(input_rank): + starts_tensor.append(add_1D_constant_layer(network, 0)) + ends_tensor.append( + get_shape_tensor_element(network, input_shape_tensor, i) + ) + + if starts_op.name() == "pd_op.full_int_array": + starts = starts_op.attrs()["value"] + assert len(starts) == len( + axes + ), "The size of this starts: %d must be equal to the axes: %d." % ( + len(starts), + len(axes), + ) + for idx in axes: + if starts[idx] < 0: + starts_tensor[axes[idx]] = trt_max( + network, + trt_sum( + network, + add_1D_constant_layer(network, starts[idx]), + get_shape_tensor_element( + network, input_shape_tensor, axes[idx] + ), + ), + add_1D_constant_layer(network, 0), + ) + else: + starts_tensor[axes[idx]] = trt_min( + network, + add_1D_constant_layer(network, starts[idx]), + get_shape_tensor_element( + network, input_shape_tensor, axes[idx] + ), + ) + else: + starts = inputs[1] + for idx in axes: + starts_tensor[axes[idx]] = get_shape_tensor_element( + network, starts, idx + ) + + if ends_op.name() == "pd_op.full_int_array": + ends = ends_op.attrs()["value"] + assert len(ends) == len( + axes + ), "The size of this ends: %d must be equal to the axes: %d." 
% ( + len(ends), + len(axes), + ) + for idx in axes: + if ends[idx] < 0: + ends_tensor[axes[idx]] = trt_max( + network, + trt_sum( + network, + add_1D_constant_layer(network, ends[idx]), + get_shape_tensor_element( + network, input_shape_tensor, axes[idx] + ), + ), + add_1D_constant_layer(network, 0), + ) + else: + ends_tensor[axes[idx]] = trt_min( + network, + add_1D_constant_layer(network, ends[idx]), + get_shape_tensor_element( + network, input_shape_tensor, axes[idx] + ), + ) + else: + ends = inputs[2] + for idx in axes: + ends_tensor[axes[idx]] = get_shape_tensor_element( + network, ends, idx + ) + + start_tensor_layer = network.add_concatenation(starts_tensor) + start_tensor_layer.axis = 0 + start_tensor = start_tensor_layer.get_output(0) + end_tensor_layer = network.add_concatenation(ends_tensor) + end_tensor_layer.axis = 0 + end_tensor = end_tensor_layer.get_output(0) + size_tensor = trt_sub(network, end_tensor, start_tensor) + + # Create Slice layer + slice_layer = network.add_slice( + input_tensor, [0] * input_rank, [0] * input_rank, [1] * input_rank + ) + slice_layer.set_input(1, start_tensor) + slice_layer.set_input(2, size_tensor) + + output_tensor = slice_layer.get_output(0) + + # Handle decrease_axis + if decrease_axis: + output_shape = network.add_shape(output_tensor).get_output(0) + new_shape_dims = [] + for i in range(output_shape.shape[0]): + if i not in decrease_axis: + dim = network.add_slice(output_shape, [i], [1], [1]).get_output( + 0 + ) + new_shape_dims.append(dim) + if len(new_shape_dims) == 0: + new_shape_tensor = network.add_constant( + [1], np.array([1], dtype=np.int32) + ) + else: + new_shape_tensor = network.add_concatenation(new_shape_dims) + new_shape_tensor.axis = 0 + + reshape_layer = network.add_shuffle(output_tensor) + reshape_layer.set_input(1, new_shape_tensor.get_output(0)) + output_tensor = reshape_layer.get_output(0) + + return output_tensor diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index 4a39429719622..1878ef63d9c4e 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -67,15 +67,27 @@ def create_fake_program(self): new_list_args.append(input_data) api_args[feed_name] = new_list_args else: - input_shape_without_dynamic_dim = self.api_args[ - feed_name - ].shape[1:] - input_dynamic_shape = [-1] - input_dynamic_shape.extend(input_shape_without_dynamic_dim) + empty_min_max_shape = ( + self.min_shape is None or self.max_shape is None + ) + if ( + not empty_min_max_shape + and feed_name in self.min_shape.keys() + and feed_name in self.max_shape.keys() + ): + # dynamic shape condition + input_shape_without_dynamic_dim = self.api_args[ + feed_name + ].shape[1:] + input_shape = [-1] + input_shape.extend(input_shape_without_dynamic_dim) + else: + input_shape = self.api_args[feed_name].shape + input_dtype = self.api_args[feed_name].dtype input_data = paddle.static.data( name=feed_name, - shape=input_dynamic_shape, + shape=input_shape, dtype=input_dtype, ) api_args[feed_name] = input_data @@ -144,6 +156,14 @@ def check_trt_result(self, rtol=1e-5, atol=1e-5): min_shape_data = dict() # noqa: C408 max_shape_data = dict() # noqa: C408 for feed_name in self.program_config["feed_list"]: + if ( + feed_name not in self.min_shape.keys() + and feed_name not in self.max_shape.keys() + ): + min_shape_data[feed_name] = self.api_args[feed_name] + max_shape_data[feed_name] = self.api_args[feed_name] + continue + if isinstance(self.api_args[feed_name], dict): for i in 
range(len(self.min_shape[feed_name])): sub_feed_name = feed_name + str(i) diff --git a/test/tensorrt/test_converter_manipulation.py b/test/tensorrt/test_converter_manipulation.py index 15442eca54dc2..638e9de6b2802 100644 --- a/test/tensorrt/test_converter_manipulation.py +++ b/test/tensorrt/test_converter_manipulation.py @@ -18,6 +18,7 @@ from tensorrt_test_base import TensorRTBaseTest import paddle +from paddle import _C_ops class TestConcatTRTPattern(TensorRTBaseTest): @@ -55,5 +56,62 @@ def test_trt_result(self): self.check_trt_result() +def slice_api(x, axes, starts, ends, infer_flags, decrease_axis): + return _C_ops.slice(x, axes, starts, ends, infer_flags, decrease_axis) + + +class TestSliceWithDecreaseAxisTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = slice_api + self.api_args = { + "x": np.random.random([6, 6, 64, 64]).astype("float32"), + "axes": [0, 1], + "starts": [0, 1], + "ends": [2, 2], + "infer_flags": [1, 1], + "decrease_axis": [1], + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 6, 64, 64]} + self.max_shape = {"x": [8, 6, 64, 64]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSliceTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.slice + self.api_args = { + "x": np.random.random([6, 6, 64, 64]).astype("float32"), + "axes": [0, 1], + "starts": [-2, -3], + "ends": [-1, -1], + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 6, 64, 64]} + self.max_shape = {"x": [8, 6, 64, 64]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSliceWithInputStartTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.slice + self.api_args = { + "x": np.random.random([5, 4, 5, 6]).astype("float32"), + "axes": [0, 1, 2], + "starts": np.array([1, 0, 2]).astype("int32"), + "ends": np.array([3, 3, 4]).astype("int32"), + } + self.program_config = {"feed_list": ["x", "starts", "ends"]} + self.min_shape = {"x": [3, 4, 5, 6]} + self.max_shape = {"x": [6, 4, 5, 6]} + + def test_trt_result(self): + self.check_trt_result() + + if __name__ == '__main__': unittest.main() diff --git a/test/tensorrt/test_trt_marker_slice.py b/test/tensorrt/test_trt_marker_slice.py deleted file mode 100644 index fbc1d952281ef..0000000000000 --- a/test/tensorrt/test_trt_marker_slice.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
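

The PassTest-based file deleted below only asserted that trt_op_marker_pass tagged pd_op.slice; the TensorRTBaseTest cases added above go further and actually build and run the TensorRT engine across a dynamic-shape range. A minimal usage sketch of that base class (op choice and shapes are illustrative):

    import numpy as np
    import paddle
    from tensorrt_test_base import TensorRTBaseTest

    class TestReluTRTPattern(TensorRTBaseTest):
        def setUp(self):
            self.python_api = paddle.nn.functional.relu
            self.api_args = {"x": np.random.random([2, 3]).astype("float32")}
            self.program_config = {"feed_list": ["x"]}
            self.min_shape = {"x": [1, 3]}  # dim 0 is treated as dynamic
            self.max_shape = {"x": [4, 3]}

        def test_trt_result(self):
            self.check_trt_result()
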
- -import unittest - -import numpy as np -from pass_test import PassTest - -import paddle -from paddle.base import core - - -class TestSliceTRTPattern(PassTest): - def is_program_valid(self, program=None): - return True - - def sample_program(self): - with paddle.pir_utils.IrGuard(): - main_prog = paddle.static.Program() - start_prog = paddle.static.Program() - with paddle.pir.core.program_guard(main_prog, start_prog): - x = paddle.static.data( - name='x', shape=[4, 5, 6], dtype='float32' - ) - - # Convert starts and ends to tensors - axes = [0, 1, 2] - starts = [-3, 0, 2] - ends = [3, 2, 4] - - sliced_1 = paddle.slice(x, axes=axes, starts=starts, ends=ends) - - out = paddle.assign(sliced_1) - self.pass_attr_list = [{'trt_op_marker_pass': {}}] - self.feeds = { - "x": np.random.random([4, 5, 6]).astype("float32"), - } - self.fetch_list = [out] - self.valid_op_map = { - "pd_op.conv2d": 0, - } - yield [main_prog, start_prog], False - - def setUp(self): - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) - self.trt_expected_ops = {"pd_op.slice"} - - def test_check_output(self): - self.check_pass_correct() - - -if __name__ == "__main__": - unittest.main() From 5a70f5646390d468cd3a76e229fb519d9665ef51 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Tue, 8 Oct 2024 15:01:37 +0800 Subject: [PATCH 029/135] Eliminating redundancy memcpy for shadow feed (#68325) * Eliminating redundancy memcpy for shadow feed * Update code * Support ostream<< for operation * Update code * Update code * Update code * Update code * Update code * Update code * Update code * Update code * Update code * Update code * Update code * Fix iterator bug * Fit remove_shadow_feed_pass --- .../hlir/dialect/operator/ir/manual_op.cc | 16 ++--- .../hlir/dialect/operator/ir/op_dialect.cc | 14 ++--- .../hlir/dialect/operator/ir/op_dialect.h | 3 +- .../hlir/framework/pir/op_lowering_group.cc | 2 +- paddle/cinn/operator_fusion/pattern_node.h | 2 +- paddle/cinn/operator_fusion/utils.h | 2 +- .../dialect/distributed/ir/dist_dialect.cc | 2 +- .../pir/dialect/distributed/ir/dist_dialect.h | 3 +- .../pir/dialect/kernel/ir/kernel_dialect.cc | 48 ++++++++------- .../pir/dialect/kernel/ir/kernel_dialect.h | 9 ++- .../pir/dialect/operator/interface/decomp.h | 2 +- .../dialect/operator/interface/decomp_vjp.h | 2 +- .../interface/get_kernel_type_for_var.h | 2 +- .../dialect/operator/interface/infermeta.h | 2 +- .../interface/layout_transformation.h | 2 +- .../dialect/operator/interface/op_yaml_info.h | 2 +- .../operator/interface/parse_kernel_key.h | 2 +- .../pir/dialect/operator/interface/vjp.h | 2 +- .../dialect/operator/ir/control_flow_op.cc | 16 ++--- .../dialect/operator/ir/manual_pylayer_op.cc | 10 ++-- .../pir/dialect/operator/ir/op_dialect.cc | 14 ++--- .../pir/dialect/operator/ir/op_dialect.h | 6 +- .../dialect/operator/ir/op_onednn_dialect.cc | 21 +++---- .../dialect/operator/ir/op_onednn_dialect.h | 2 +- .../pir/dialect/operator/trait/custom_vjp.h | 2 +- .../pir/dialect/operator/trait/forward_only.h | 2 +- .../pir/dialect/operator/trait/inplace.h | 2 +- .../fluid/pir/dialect/operator/trait/onednn.h | 6 +- .../fluid/pir/transforms/build_cinn_pass.cc | 2 +- .../general/remove_shadow_feed_pass.cc | 13 +++- .../pir/transforms/pd_op_to_kernel_pass.cc | 57 +++++++++++++++++- .../pir/transforms/sub_graph_detector.cc | 2 +- paddle/phi/kernels/data_kernel.h | 2 + paddle/phi/kernels/impl/data_impl.h | 35 +++++++++-- .../phi/ops/yaml/inconsistent/static_ops.yaml | 8 +-- paddle/pir/include/core/block.h | 2 + 
paddle/pir/include/core/builtin_op.h | 4 +- paddle/pir/include/core/dialect.h | 4 +- paddle/pir/include/core/ir_printer.h | 18 +++--- paddle/pir/include/core/op_base.h | 13 ++-- paddle/pir/include/core/op_trait.h | 12 ++-- paddle/pir/include/core/operation.h | 10 ++-- .../dialect/control_flow/ir/cf_dialect.h | 2 +- .../dialect/control_flow/ir/cf_interface.h | 2 +- .../cache_grad_op_symbolic_shape.h | 2 +- .../infer_symbolic_shape.h | 2 +- paddle/pir/src/core/builtin_op.cc | 10 ++-- paddle/pir/src/core/dialect.cc | 2 +- paddle/pir/src/core/ir_printer.cc | 60 ++++++++++--------- paddle/pir/src/core/op_trait.cc | 14 ++--- paddle/pir/src/core/operation.cc | 6 ++ .../src/dialect/control_flow/ir/cf_dialect.cc | 9 +-- .../pir/src/dialect/control_flow/ir/cf_op.cc | 2 +- .../transforms/shape_optimization_pass.cc | 6 +- .../src/dialect/shape/utils/shape_analysis.cc | 12 ++-- test/cpp/pir/core/ir_op_test.cc | 4 +- test/cpp/pir/tools/test1_dialect.cc | 6 +- test/cpp/pir/tools/test1_dialect.h | 2 +- test/cpp/pir/tools/test_dialect.cc | 6 +- test/cpp/pir/tools/test_dialect.h | 2 +- test/cpp/pir/tools/test_trait.h | 4 +- 61 files changed, 325 insertions(+), 208 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index 5880dcb5e6633..61362d14da399 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -103,15 +103,15 @@ void GroupOp::VerifySig() {} void GroupOp::Print(pir::IrPrinter& printer) { auto& os = printer.os; auto op = operation(); - printer.PrintOpResult(op); + printer.PrintOpResult(*op); os << " = \"" << name() << "\" [id:" << op->id() << "]"; - printer.PrintOpOperands(op); + printer.PrintOpOperands(*op); os << " -> "; - printer.PrintOpReturnType(op); + printer.PrintOpReturnType(*op); os << " {\n"; printer.AddIndentation(); for (auto& sub_op : GetOperators()) { - printer.PrintOperation(sub_op); + printer.PrintOperation(*sub_op); os << "\n"; } printer.DecreaseIndentation(); @@ -187,15 +187,15 @@ void FusionOp::VerifySig() {} void FusionOp::Print(pir::IrPrinter& printer) { auto& os = printer.os; auto op = operation(); - printer.PrintOpResult(op); + printer.PrintOpResult(*op); os << " = \"" << name() << "\" [id:" << op->id() << "]"; - printer.PrintOpOperands(op); + printer.PrintOpOperands(*op); os << " -> "; - printer.PrintOpReturnType(op); + printer.PrintOpReturnType(*op); os << " {\n"; printer.AddIndentation(); for (auto& sub_op : GetOperators()) { - printer.PrintOperation(sub_op); + printer.PrintOperation(*sub_op); os << "\n"; } printer.DecreaseIndentation(); diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc index aaf277d92bbb2..a6eb7805b212d 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc @@ -91,15 +91,15 @@ void OperatorDialect::PrintAttribute(pir::Attribute attr, } } -pir::OpPrintFn OperatorDialect::PrintOperation(pir::Operation *op) const { - if (auto group_op = op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter &printer) { - auto group_op = op->dyn_cast(); +pir::OpPrintFn OperatorDialect::PrintOperation(const pir::Operation &op) const { + if (auto group_op = op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { + auto group_op = op.dyn_cast(); group_op.Print(printer); }; - } else if (auto fusion_op = op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter 
&printer) { - auto fusion_op = op->dyn_cast(); + } else if (auto fusion_op = op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { + auto fusion_op = op.dyn_cast(); fusion_op.Print(printer); }; } diff --git a/paddle/cinn/hlir/dialect/operator/ir/op_dialect.h b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.h index e68fd03117023..39d88f52ae356 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/op_dialect.h +++ b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.h @@ -27,7 +27,8 @@ class OperatorDialect : public ::pir::Dialect { void PrintType(pir::Type type, std::ostream& os) const override; void PrintAttribute(pir::Attribute type, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT private: void initialize(); diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc index cc185f57d2615..033af89d01ae4 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc @@ -170,7 +170,7 @@ std::ostream& operator<<(std::ostream& os, const OpLoweringGroup& group) { os << "Group id: " << group.group_id() << ", func_name: " << group.FuncName() << "\n"; for (auto* op : group.ops()) { - printer.PrintOperation(op); + printer.PrintOperation(*op); PrintSymbolDims(*op); os << "\n"; } diff --git a/paddle/cinn/operator_fusion/pattern_node.h b/paddle/cinn/operator_fusion/pattern_node.h index 48fe76e9d1b19..6aa0ed68c10d3 100644 --- a/paddle/cinn/operator_fusion/pattern_node.h +++ b/paddle/cinn/operator_fusion/pattern_node.h @@ -66,7 +66,7 @@ struct PatternNode { ss << "\n anchor: "; auto anchor_op = std::get(stmt_pattern_).anchor().defining_op(); - printer.PrintOperation(const_cast(anchor_op)); + printer.PrintOperation(*anchor_op); } ss << "\nOps in pattern:" << std::endl; ss << OpsDebugStr(GetOpsInPattern(this->stmt_pattern())); diff --git a/paddle/cinn/operator_fusion/utils.h b/paddle/cinn/operator_fusion/utils.h index 2c125aa9e9b4f..f6e81d9028c40 100644 --- a/paddle/cinn/operator_fusion/utils.h +++ b/paddle/cinn/operator_fusion/utils.h @@ -89,7 +89,7 @@ static std::string OpsDebugStr(std::vector ops) { std::stringstream ss; pir::IrPrinter printer(ss); for (const auto* op : ops) { - printer.PrintOperation(const_cast(op)); + printer.PrintOperation(*op); ss << "(" << op << ")" << "\n"; } diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc index f3add0ab70a8a..9b858b2fe2c84 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc @@ -120,7 +120,7 @@ void DistDialect::PrintAttribute(pir::Attribute attr, std::ostream &os) const { } } -pir::OpPrintFn DistDialect::PrintOperation(pir::Operation *op) const { +pir::OpPrintFn DistDialect::PrintOperation(const pir::Operation &op) const { return nullptr; } diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h index 2a7420b0a495a..6c4c5ea60995a 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h @@ -29,7 +29,8 @@ class DistDialect : public pir::Dialect { void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const 
override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT private: void initialize(); diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc index a52fb2125a41a..e9af3a035a4af 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc @@ -112,16 +112,16 @@ void KernelDialect::PrintAttribute(pir::Attribute attr, PrintKernelAttribute(attr, os); } -pir::OpPrintFn KernelDialect::PrintOperation(pir::Operation *op) const { - if (op->dyn_cast() || op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter &printer) { +pir::OpPrintFn KernelDialect::PrintOperation(const pir::Operation &op) const { + if (op.dyn_cast() || op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { auto &os = printer.os; printer.PrintOpResult(op); os << " ="; - if (auto phi_kernel_op = op->dyn_cast()) { + if (auto phi_kernel_op = op.dyn_cast()) { std::string kernel_name = phi_kernel_op.kernel_name(); - if (op->attributes().count("is_inplace") != 0 && - op->attributes() + if (op.attributes().count("is_inplace") != 0 && + op.attributes() .at("is_inplace") .dyn_cast() .data()) { @@ -129,10 +129,10 @@ pir::OpPrintFn KernelDialect::PrintOperation(pir::Operation *op) const { } os << " \"" << kernel_name << "(phi_kernel)\""; } else { - auto legacy_kernel_op = op->dyn_cast(); + auto legacy_kernel_op = op.dyn_cast(); std::string kernel_name = legacy_kernel_op.kernel_name(); - if (op->attributes().count("is_inplace") != 0 && - op->attributes() + if (op.attributes().count("is_inplace") != 0 && + op.attributes() .at("is_inplace") .dyn_cast() .data()) { @@ -169,15 +169,16 @@ void CustomKernelDialect::PrintAttribute(pir::Attribute attr, PrintKernelAttribute(attr, os); } -pir::OpPrintFn CustomKernelDialect::PrintOperation(pir::Operation *op) const { - return [](pir::Operation *op, pir::IrPrinter &printer) { +pir::OpPrintFn CustomKernelDialect::PrintOperation( + const pir::Operation &op) const { + return [](const pir::Operation &op, pir::IrPrinter &printer) { auto &os = printer.os; printer.PrintOpResult(op); os << " ="; - auto custom_kernel_op = op->dyn_cast(); + auto custom_kernel_op = op.dyn_cast(); std::string kernel_name = custom_kernel_op.kernel_name(); - if (op->attributes().count("is_inplace") != 0 && - op->attributes() + if (op.attributes().count("is_inplace") != 0 && + op.attributes() .at("is_inplace") .dyn_cast() .data()) { @@ -213,16 +214,17 @@ void OneDNNKernelDialect::PrintAttribute(pir::Attribute attr, PrintKernelAttribute(attr, os); } -pir::OpPrintFn OneDNNKernelDialect::PrintOperation(pir::Operation *op) const { - if (op->dyn_cast() || op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter &printer) { +pir::OpPrintFn OneDNNKernelDialect::PrintOperation( + const pir::Operation &op) const { + if (op.dyn_cast() || op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { auto &os = printer.os; printer.PrintOpResult(op); os << " ="; - if (auto phi_kernel_op = op->dyn_cast()) { + if (auto phi_kernel_op = op.dyn_cast()) { std::string kernel_name = phi_kernel_op.kernel_name(); - if (op->attributes().count("is_inplace") != 0 && - op->attributes() + if (op.attributes().count("is_inplace") != 0 && + op.attributes() .at("is_inplace") .dyn_cast() .data()) { @@ -230,10 +232,10 @@ pir::OpPrintFn OneDNNKernelDialect::PrintOperation(pir::Operation *op) const { } os << " \"" << kernel_name << 
"(phi_kernel)\""; } else { - auto legacy_kernel_op = op->dyn_cast(); + auto legacy_kernel_op = op.dyn_cast(); std::string kernel_name = legacy_kernel_op.kernel_name(); - if (op->attributes().count("is_inplace") != 0 && - op->attributes() + if (op.attributes().count("is_inplace") != 0 && + op.attributes() .at("is_inplace") .dyn_cast() .data()) { diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h index ab6ec783b4909..128a8490c93d4 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h @@ -29,7 +29,8 @@ class KernelDialect : public pir::Dialect { void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT private: void initialize(); @@ -45,7 +46,8 @@ class CustomKernelDialect : public pir::Dialect { void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT private: void initialize(); @@ -62,7 +64,8 @@ class OneDNNKernelDialect : public pir::Dialect { void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT private: void initialize(); diff --git a/paddle/fluid/pir/dialect/operator/interface/decomp.h b/paddle/fluid/pir/dialect/operator/interface/decomp.h index 48697de7e28b3..414d51945cd5b 100644 --- a/paddle/fluid/pir/dialect/operator/interface/decomp.h +++ b/paddle/fluid/pir/dialect/operator/interface/decomp.h @@ -35,7 +35,7 @@ class DecompInterface : public pir::OpInterfaceBase { }; /// Constructor - DecompInterface(pir::Operation* op, Concept* impl) + DecompInterface(const pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} std::vector> Decomp(pir::Operation* op) { diff --git a/paddle/fluid/pir/dialect/operator/interface/decomp_vjp.h b/paddle/fluid/pir/dialect/operator/interface/decomp_vjp.h index bc8c8254df6b5..13b4c37683264 100644 --- a/paddle/fluid/pir/dialect/operator/interface/decomp_vjp.h +++ b/paddle/fluid/pir/dialect/operator/interface/decomp_vjp.h @@ -35,7 +35,7 @@ class DecompVjpInterface : public pir::OpInterfaceBase { }; /// Constructor - DecompVjpInterface(pir::Operation* op, Concept* impl) + DecompVjpInterface(const pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} std::vector> DecompVjp(pir::Operation* op) { diff --git a/paddle/fluid/pir/dialect/operator/interface/get_kernel_type_for_var.h b/paddle/fluid/pir/dialect/operator/interface/get_kernel_type_for_var.h index 77746326a00b7..df16f44a04234 100644 --- a/paddle/fluid/pir/dialect/operator/interface/get_kernel_type_for_var.h +++ b/paddle/fluid/pir/dialect/operator/interface/get_kernel_type_for_var.h @@ -48,7 +48,7 @@ class GetKernelTypeForVarInterface }; /// Constructor - GetKernelTypeForVarInterface(pir::Operation* op, Concept* impl) + GetKernelTypeForVarInterface(const pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} phi::DataType GetKernelTypeForVar( diff --git a/paddle/fluid/pir/dialect/operator/interface/infermeta.h 
b/paddle/fluid/pir/dialect/operator/interface/infermeta.h index d5197af5be94f..8ed3405487779 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infermeta.h +++ b/paddle/fluid/pir/dialect/operator/interface/infermeta.h @@ -47,7 +47,7 @@ class InferMetaInterface : public pir::OpInterfaceBase { }; /// Constructor - InferMetaInterface(pir::Operation *op, Concept *impl) + InferMetaInterface(const pir::Operation *op, Concept *impl) : pir::OpInterfaceBase(op), impl_(impl) {} void InferMeta(phi::InferMetaContext *infer_meta) { diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h index 52ed9c6c289e7..828ebd449b0de 100644 --- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.h @@ -86,7 +86,7 @@ class LayoutTransformationInterface CanBeModifiedModel) {} }; - LayoutTransformationInterface(pir::Operation* op, Concept* impl) + LayoutTransformationInterface(const pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} common::DataLayout PreferLayout(pir::Operation* op) { diff --git a/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h b/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h index 6da9c41f71496..b02c9de357f31 100644 --- a/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h +++ b/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h @@ -43,7 +43,7 @@ class OpYamlInfoInterface : public pir::OpInterfaceBase { }; /// Constructor - OpYamlInfoInterface(pir::Operation* op, Concept* impl) + OpYamlInfoInterface(const pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} OpInfoTuple GetOpInfo() { return impl_->get_op_info_(operation_->name()); } diff --git a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h index 769c12aac117e..c73c85a914269 100644 --- a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h +++ b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h @@ -43,7 +43,7 @@ class ParseKernelKeyInterface }; /// Constructor - ParseKernelKeyInterface(pir::Operation *op, Concept *impl) + ParseKernelKeyInterface(const pir::Operation *op, Concept *impl) : pir::OpInterfaceBase(op), impl_(impl) {} KernelKeyTuple ParseKernelKey(pir::Operation *op) { diff --git a/paddle/fluid/pir/dialect/operator/interface/vjp.h b/paddle/fluid/pir/dialect/operator/interface/vjp.h index a59e4389ecbaa..3150d4c2c5f2b 100644 --- a/paddle/fluid/pir/dialect/operator/interface/vjp.h +++ b/paddle/fluid/pir/dialect/operator/interface/vjp.h @@ -50,7 +50,7 @@ class VjpInterface : public pir::OpInterfaceBase { }; /// Constructor - VjpInterface(pir::Operation* op, Concept* impl) + VjpInterface(const pir::Operation* op, Concept* impl) : pir::OpInterfaceBase(op), impl_(impl) {} std::vector> Vjp( diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index a0a45f827a50c..7180345cd72be 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -168,28 +168,28 @@ pir::Block &IfOp::false_block() { void IfOp::Print(pir::IrPrinter &printer) { auto &os = printer.os; auto op = operation(); - printer.PrintOpResult(op); + printer.PrintOpResult(*op); os << " = \"" << name() << "\""; if (VLOG_IS_ON(1) || FLAGS_pir_debug) { os << " [id:" << 
op->id() << "]"; } - printer.PrintOpOperands(op); - printer.PrintAttributeMap(op); + printer.PrintOpOperands(*op); + printer.PrintAttributeMap(*op); os << " -> "; - printer.PrintOpReturnType(op); + printer.PrintOpReturnType(*op); os << " {\n"; printer.AddIndentation(); for (auto &item : true_block()) { - printer.PrintOperation(&item); + printer.PrintOperation(item); os << "\n"; } printer.DecreaseIndentation(); os << printer.indentation() << "} else {\n"; printer.AddIndentation(); for (auto &item : false_block()) { - printer.PrintOperation(&item); + printer.PrintOperation(item); os << "\n"; } printer.DecreaseIndentation(); @@ -426,7 +426,7 @@ pir::Value WhileOp::cond() { return (*this)->operand_source(0); } void WhileOp::Print(pir::IrPrinter &printer) { auto &os = printer.os; auto op = operation(); - printer.PrintOpResult(op); + printer.PrintOpResult(*op); os << " = \"" << name() << "\""; if (VLOG_IS_ON(1) || FLAGS_pir_debug) { os << " [id:" << op->id() << "]"; @@ -450,7 +450,7 @@ void WhileOp::Print(pir::IrPrinter &printer) { os << "\n"; printer.AddIndentation(); for (auto &item : body()) { - printer.PrintOperation(&item); + printer.PrintOperation(item); os << "\n"; } printer.DecreaseIndentation(); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_pylayer_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_pylayer_op.cc index 27414cd243538..29c7f87918598 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_pylayer_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_pylayer_op.cc @@ -118,21 +118,21 @@ pir::Block &PyLayerOp::forward_block() { void PyLayerOp::Print(pir::IrPrinter &printer) { auto &os = printer.os; auto op = operation(); - printer.PrintOpResult(op); + printer.PrintOpResult(*op); os << " = pd_op.pylayer"; if (VLOG_IS_ON(1) || FLAGS_pir_debug) { os << " [id:" << op->id() << "]"; } - printer.PrintOpOperands(op); - printer.PrintAttributeMap(op); + printer.PrintOpOperands(*op); + printer.PrintAttributeMap(*op); os << " -> "; - printer.PrintOpReturnType(op); + printer.PrintOpReturnType(*op); os << "{"; for (auto &item : forward_block()) { os << "\n "; - printer.PrintOperation(&item); + printer.PrintOperation(item); } os << "\n }"; } diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index ce14cdbe9c793..0cd4f7d9ba980 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -326,13 +326,13 @@ void PrintAttributeImpl(pir::Attribute attr, std::ostream& os) { } } -void PrintOperationImpl(pir::Operation* op, +void PrintOperationImpl(const pir::Operation& op, pir::IrPrinter& printer) { // NOLINT - if (auto if_op = op->dyn_cast()) { + if (auto if_op = op.dyn_cast()) { if_op.Print(printer); - } else if (auto while_op = op->dyn_cast()) { + } else if (auto while_op = op.dyn_cast()) { while_op.Print(printer); - } else if (auto pylayer_op = op->dyn_cast()) { + } else if (auto pylayer_op = op.dyn_cast()) { pylayer_op.Print(printer); } else { printer.PrintGeneralOperation(op); @@ -431,8 +431,8 @@ pir::Attribute OperatorDialect::ParseAttribute( } } -pir::OpPrintFn OperatorDialect::PrintOperation(pir::Operation* op) const { - if (op->isa() || op->isa() || op->isa()) { +pir::OpPrintFn OperatorDialect::PrintOperation(const pir::Operation& op) const { + if (op.isa() || op.isa() || op.isa()) { return PrintOperationImpl; } return nullptr; @@ -1074,7 +1074,7 @@ void CustomOpDialect::PrintAttribute(pir::Attribute attr, PrintAttributeImpl(attr, os); } 
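
For readers tracking this refactor: every dialect PrintOperation hook now receives the operation by const reference, and the OpPrintFn it returns is const-qualified as well. A minimal sketch of a conforming hook follows; MyDialect, MyCustomOp, and the "my.custom" mnemonic are illustrative names, not part of this patch.

    #include "paddle/pir/include/core/dialect.h"
    #include "paddle/pir/include/core/ir_printer.h"

    // Sketch only: a dialect-specific printer under the new const signature.
    pir::OpPrintFn MyDialect::PrintOperation(const pir::Operation &op) const {
      if (op.isa<MyCustomOp>()) {
        return [](const pir::Operation &op, pir::IrPrinter &printer) {
          printer.PrintOpResult(op);       // "(%0, ...)"
          printer.os << " =";
          printer.os << " \"my.custom\"";  // custom mnemonic
          printer.PrintOpOperands(op);     // "(%operand0, ...)"
          printer.os << " -> ";
          printer.PrintOpReturnType(op);   // result types
        };
      }
      return nullptr;  // nullptr => IrPrinter::PrintGeneralOperation is used
    }
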
-pir::OpPrintFn CustomOpDialect::PrintOperation(pir::Operation* op) const { +pir::OpPrintFn CustomOpDialect::PrintOperation(const pir::Operation& op) const { return nullptr; } diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.h b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h index df47461d85607..4571857c65937 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h @@ -34,7 +34,8 @@ class TEST_API OperatorDialect : public pir::Dialect { void PrintType(pir::Type type, std::ostream& os) const override; void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT private: void initialize(); @@ -59,7 +60,8 @@ class CustomOpDialect : public pir::Dialect { void PrintType(pir::Type type, std::ostream& os) const override; void PrintAttribute(pir::Attribute type, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; // NOLINT + pir::OpPrintFn PrintOperation( + const pir::Operation& op) const override; // NOLINT void RegisterCustomOp(const paddle::OpMetaInfo& op_meta); diff --git a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc index ca361db706a6e..bc1a3701be614 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc @@ -129,20 +129,21 @@ pir::Attribute OneDNNOperatorDialect::ParseAttribute( } } -pir::OpPrintFn OneDNNOperatorDialect::PrintOperation(pir::Operation *op) const { - if (auto if_op = op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter &printer) { - auto if_op = op->dyn_cast(); +pir::OpPrintFn OneDNNOperatorDialect::PrintOperation( + const pir::Operation &op) const { + if (auto if_op = op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { + auto if_op = op.dyn_cast(); if_op.Print(printer); }; - } else if (auto pylayer_op = op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter &printer) { - auto pylayer_op = op->dyn_cast(); + } else if (auto pylayer_op = op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { + auto pylayer_op = op.dyn_cast(); pylayer_op.Print(printer); }; - } else if (auto while_op = op->dyn_cast()) { - return [](pir::Operation *op, pir::IrPrinter &printer) { - auto while_op = op->dyn_cast(); + } else if (auto while_op = op.dyn_cast()) { + return [](const pir::Operation &op, pir::IrPrinter &printer) { + auto while_op = op.dyn_cast(); while_op.Print(printer); }; } diff --git a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h index 6ef33672c9c96..75cc2022e978b 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h @@ -30,7 +30,7 @@ class OneDNNOperatorDialect : public pir::Dialect { void PrintType(pir::Type type, std::ostream& os) const override; void PrintAttribute(pir::Attribute type, std::ostream& os) const override; - pir::OpPrintFn PrintOperation(pir::Operation* op) const override; + pir::OpPrintFn PrintOperation(const pir::Operation& op) const override; private: void initialize(); diff --git a/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h b/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h index 
1b6585c498cc2..5fe27e91a2448 100644 --- a/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h +++ b/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h @@ -28,7 +28,7 @@ namespace paddle { namespace dialect { class CustomVjpTrait : public pir::OpTraitBase { public: - explicit CustomVjpTrait(pir::Operation *op) + explicit CustomVjpTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; diff --git a/paddle/fluid/pir/dialect/operator/trait/forward_only.h b/paddle/fluid/pir/dialect/operator/trait/forward_only.h index 72b869b903f51..05ca558e8f4f3 100644 --- a/paddle/fluid/pir/dialect/operator/trait/forward_only.h +++ b/paddle/fluid/pir/dialect/operator/trait/forward_only.h @@ -20,7 +20,7 @@ namespace paddle { namespace dialect { class ForwardOnlyTrait : public pir::OpTraitBase { public: - explicit ForwardOnlyTrait(pir::Operation *op) + explicit ForwardOnlyTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; diff --git a/paddle/fluid/pir/dialect/operator/trait/inplace.h b/paddle/fluid/pir/dialect/operator/trait/inplace.h index 9d214cece5e05..5b602e5047caa 100644 --- a/paddle/fluid/pir/dialect/operator/trait/inplace.h +++ b/paddle/fluid/pir/dialect/operator/trait/inplace.h @@ -20,7 +20,7 @@ namespace paddle { namespace dialect { class InplaceTrait : public pir::OpTraitBase { public: - explicit InplaceTrait(pir::Operation *op) + explicit InplaceTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; diff --git a/paddle/fluid/pir/dialect/operator/trait/onednn.h b/paddle/fluid/pir/dialect/operator/trait/onednn.h index 51bda4697fe5d..a2a935f0aa66b 100644 --- a/paddle/fluid/pir/dialect/operator/trait/onednn.h +++ b/paddle/fluid/pir/dialect/operator/trait/onednn.h @@ -22,20 +22,20 @@ namespace paddle { namespace dialect { class OneDNNTrait : public pir::OpTraitBase { public: - explicit OneDNNTrait(pir::Operation *op) + explicit OneDNNTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; class OneDNNOnlyTrait : public pir::OpTraitBase { public: - explicit OneDNNOnlyTrait(pir::Operation *op) + explicit OneDNNOnlyTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; class OneDNNDynamicFallbackTrait : public pir::OpTraitBase { public: - explicit OneDNNDynamicFallbackTrait(pir::Operation *op) + explicit OneDNNDynamicFallbackTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; diff --git a/paddle/fluid/pir/transforms/build_cinn_pass.cc b/paddle/fluid/pir/transforms/build_cinn_pass.cc index bc14876ceed55..632c8785a240b 100644 --- a/paddle/fluid/pir/transforms/build_cinn_pass.cc +++ b/paddle/fluid/pir/transforms/build_cinn_pass.cc @@ -72,7 +72,7 @@ static std::string OpsDebugStr(std::vector ops) { std::stringstream ss; pir::IrPrinter printer(ss); for (const auto* op : ops) { - printer.PrintOperation(const_cast(op)); + printer.PrintOperation(*op); ss << "{" << op->id() << "}\n"; } return ss.str(); diff --git a/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc b/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc index d6b81f6f34844..fd8d4aea8c2d0 100644 --- a/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc +++ b/paddle/fluid/pir/transforms/general/remove_shadow_feed_pass.cc @@ -86,7 +86,7 @@ class RemoveShadowFeedPattern if (!var) { return false; } - phi::Place var_place; + phi::Place var_place, dst_place; if (var->IsType()) { var_place = GetVarPlace(var, place_); } else if (var->IsType()) { @@ -99,7 +99,16 @@ class RemoveShadowFeedPattern "RemoveShadowFeedPattern only support output " "variable of type DenseTensor, SelectedRows or 
VariableRefArray")); } - return var_place == place_; + + int dst_place_type = + op.attribute("dst_place_type").dyn_cast().data(); + if (dst_place_type == 0) { + dst_place = phi::CPUPlace(); + } else { + dst_place = place_; + } + + return var_place == dst_place; } return false; } diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 9b9ee516c41d6..8047630858f54 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -1664,7 +1664,8 @@ void AddShadowFeedForValue( std::unordered_map attr_map{ {"op_name", pir::StrAttribute::get(ctx, "pd_op.shadow_feed")}, {"kernel_name", pir::StrAttribute::get(ctx, "shadow_feed")}, - {"kernel_key", KernelAttribute::get(ctx, shadow_key)}}; + {"kernel_key", KernelAttribute::get(ctx, shadow_key)}, + {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; auto out_type = AllocatedDenseTensorType::get( ctx, @@ -1699,7 +1700,8 @@ void AddShadowFeedForValue( std::unordered_map attr_map{ {"op_name", pir::StrAttribute::get(ctx, "pd_op.shadow_feed_tensors")}, {"kernel_name", pir::StrAttribute::get(ctx, "shadow_feed_tensors")}, - {"kernel_key", KernelAttribute::get(ctx, shadow_key)}}; + {"kernel_key", KernelAttribute::get(ctx, shadow_key)}, + {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; pir::OpInfo phi_kernel_op_info = ctx->GetRegisteredOpInfo(PhiKernelOp::name()); @@ -3001,6 +3003,52 @@ void AddShadowFeedOpForDataOrFeed( } } +/* + shadow_feed(x), y = memcpy_d2h(x), any_op(y) +=> shadow_feed(x, dst_place=cpu_place), any_op(x) + + shadow_feed(x), y = memcpy_h2d(x), any_op(y) +=> shadow_feed(x, dst_place=gpu_place), any_op(x) +*/ +void RemoveRedundantMemcpyAfterShadowFeed(pir::Block* block, + pir::IrContext* ctx) { + for (auto it = block->begin(); it != block->end(); ++it) { + if (it->isa() && + (it->dyn_cast().op_name() == "pd_op.shadow_feed" || + it->dyn_cast().op_name() == + "pd_op.shadow_feed_tensors")) { + pir::Value shadow_value = it->result(0); + if (shadow_value.use_count() == 1) { + pir::Operation* next_op = shadow_value.first_use().owner(); + bool is_memcpy_d2h = + next_op->isa() && + next_op->dyn_cast().op_name() == "pd_op.memcpy_d2h"; + bool is_memcpy_h2d = + next_op->isa() && + next_op->dyn_cast().op_name() == "pd_op.memcpy_h2d"; + + if (is_memcpy_d2h || is_memcpy_h2d) { + VLOG(6) << "Remove redundant memcpy op after shadow_feed"; + VLOG(6) << *it; + VLOG(6) << next_op; + VLOG(6) << "==>"; + + // remove memcpy op + next_op->result(0).ReplaceAllUsesWith(shadow_value); + block->erase(next_op->operator pir::Block::ConstIterator()); + + // set dst_place_type for shadow_feed, 0 for cpu_place, 1 for + // gpu_place + int dst_place_type = is_memcpy_d2h ? 
0 : 1; + it->set_attribute("dst_place_type", + pir::Int32Attribute::get(ctx, dst_place_type)); + VLOG(6) << *it; + } + } + } + } +} + pir::Operation* BuildKernelOp( const std::string& kernel_fn_str, const phi::KernelKey& kernel_key, @@ -3215,7 +3263,8 @@ void ProcessBlock( std::unordered_map attr_map{ {"op_name", pir::StrAttribute::get(ctx, "pd_op.shadow_feed")}, {"kernel_name", pir::StrAttribute::get(ctx, "shadow_feed")}, - {"kernel_key", KernelAttribute::get(ctx, shadow_key)}}; + {"kernel_key", KernelAttribute::get(ctx, shadow_key)}, + {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; auto out_type = AllocatedDenseTensorType::get(ctx, place, dense_tensor_type); @@ -3356,6 +3405,8 @@ void ProcessBlock( AddShadowFeedOpForDataOrFeed( place, op_item, op, new_block, ctx, map_op_pair, map_value_pair); } + + RemoveRedundantMemcpyAfterShadowFeed(new_block, ctx); } std::unique_ptr PdOpLowerToKernelPass(pir::Program* prog, diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.cc b/paddle/fluid/pir/transforms/sub_graph_detector.cc index 7b8ec4bbf4cf2..39daea9a56eda 100644 --- a/paddle/fluid/pir/transforms/sub_graph_detector.cc +++ b/paddle/fluid/pir/transforms/sub_graph_detector.cc @@ -217,7 +217,7 @@ static std::string OpsDebugStr(std::vector ops) { std::stringstream ss; pir::IrPrinter printer(ss); for (const auto* op : ops) { - printer.PrintOperation(const_cast(op)); + printer.PrintOperation(*op); ss << "{" << op->id() << "}\n"; } return ss.str(); diff --git a/paddle/phi/kernels/data_kernel.h b/paddle/phi/kernels/data_kernel.h index 94d33f7e7ca98..96242f3ef94d4 100644 --- a/paddle/phi/kernels/data_kernel.h +++ b/paddle/phi/kernels/data_kernel.h @@ -34,11 +34,13 @@ void ShadowOutputKernel(const Context& ctx, template void ShadowFeedKernel(const Context& ctx, const DenseTensor& x, + int dst_place_type, DenseTensor* out); template void ShadowFeedTensorsKernel(const Context& ctx, const std::vector& xs, + int dst_place_type, std::vector outs); template diff --git a/paddle/phi/kernels/impl/data_impl.h b/paddle/phi/kernels/impl/data_impl.h index fb089d1664535..487840353cb12 100644 --- a/paddle/phi/kernels/impl/data_impl.h +++ b/paddle/phi/kernels/impl/data_impl.h @@ -26,25 +26,52 @@ const char kBackward[] = "BACKWARD"; template void ShadowFeedKernel(const Context& ctx, const DenseTensor& x, + int dst_place_type, DenseTensor* out) { + Place target_place; + switch (dst_place_type) { + case 0: // CPUPlace + target_place = CPUPlace(); + break; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + case 1: // CUDAPlace + target_place = GPUPlace(backends::gpu::GetCurrentDeviceId()); + break; +#elif defined(PADDLE_WITH_XPU) + case 1: // XPUPlace + target_place = XPUPlace(backends::xpu::GetXPUCurrentDeviceId()); + break; +#endif + default: + PADDLE_THROW(errors::Unimplemented("dst_place_type: %d is not supported.", + dst_place_type)); + break; + } + if (!x.initialized()) { - ctx.template Alloc(out); + if (target_place == CPUPlace()) { + ctx.template HostAlloc(out); + } else { + ctx.template Alloc(out); + } return; } - if (x.place() == ctx.GetPlace()) { + + if (x.place() == target_place) { out->ShareDataWith(x); out->set_lod(x.lod()); } else { - phi::Copy(ctx, x, ctx.GetPlace(), true, out); + phi::Copy(ctx, x, target_place, true, out); } } template void ShadowFeedTensorsKernel(const Context& ctx, const std::vector& xs, + int dst_place_type, std::vector outs) { for (size_t i = 0; i < xs.size(); ++i) { - ShadowFeedKernel(ctx, *(xs[i]), outs[i]); + ShadowFeedKernel(ctx, *(xs[i]), dst_place_type, 
outs[i]); } } diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index 583d912d904d6..b427ae205d970 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -888,25 +888,25 @@ interfaces : paddle::dialect::InferSymbolicShapeInterface - op : shadow_feed - args : (Tensor x) + args : (Tensor x, int dst_place_type) output : Tensor(out) infer_meta: func: UnchangedInferMeta param: [x] kernel: func: shadow_feed - param: [x] + param: [x, dst_place_type] interfaces : paddle::dialect::InferSymbolicShapeInterface - op : shadow_feed_tensors - args : (Tensor[] x) + args : (Tensor[] x, int dst_place_type) output : Tensor[](out){x.size()} infer_meta: func: UnchangedVectorInferMeta param: [x] kernel: func: shadow_feed_tensors - param: [x] + param: [x, dst_place_type] - op : share_data_ args : (Tensor x) diff --git a/paddle/pir/include/core/block.h b/paddle/pir/include/core/block.h index 3756e738b22bb..c4479686c69e7 100644 --- a/paddle/pir/include/core/block.h +++ b/paddle/pir/include/core/block.h @@ -198,6 +198,8 @@ class IR_API Block { Region *parent_; // not owned }; +std::ostream &operator<<(std::ostream &os, const Block &block); + template void Block::AddArgs(TypeIter first, TypeIter last) { while (first != last) { diff --git a/paddle/pir/include/core/builtin_op.h b/paddle/pir/include/core/builtin_op.h index eee9345e93cac..bcdc560d6cb4b 100644 --- a/paddle/pir/include/core/builtin_op.h +++ b/paddle/pir/include/core/builtin_op.h @@ -211,7 +211,7 @@ class IR_API SplitOp : public pir::Op { class IR_API ConstantLikeTrait : public OpTraitBase { public: - explicit ConstantLikeTrait(Operation *op) + explicit ConstantLikeTrait(const Operation *op) : OpTraitBase(op) {} }; @@ -244,7 +244,7 @@ class IR_API ConstantTensorOp : public ConstantOp { public: using ConstantOp::ConstantOp; - static ConstantTensorOp dyn_cast(Operation *op); + static ConstantTensorOp dyn_cast(const Operation *op); static bool classof(const Operation *op); static void Build(Builder &builder, // NOLINT diff --git a/paddle/pir/include/core/dialect.h b/paddle/pir/include/core/dialect.h index 9beb859d11a26..ec5f16714e03c 100644 --- a/paddle/pir/include/core/dialect.h +++ b/paddle/pir/include/core/dialect.h @@ -33,7 +33,7 @@ class IrParser; class DialectInterface; using OpPrintFn = - std::function; // NOLINT + std::function; // NOLINT /// /// \brief Dialect can basically be understood as a namespace. 
In Dialect, we @@ -158,7 +158,7 @@ class IR_API Dialect { IR_THROW("dialect has no registered attribute parsing hook"); } - virtual OpPrintFn PrintOperation(Operation *op) const; + virtual OpPrintFn PrintOperation(const Operation &op) const; virtual Operation ParseOperation(IrParser &parser) { // NOLINT IR_THROW("dialect has no registered operation parsing hook"); diff --git a/paddle/pir/include/core/ir_printer.h b/paddle/pir/include/core/ir_printer.h index 6e066911f9c8b..4bb08c5d25df5 100644 --- a/paddle/pir/include/core/ir_printer.h +++ b/paddle/pir/include/core/ir_printer.h @@ -49,26 +49,26 @@ class IR_API IrPrinter : public BasicIrPrinter { void PrintProgram(const Program* program); /// @brief dispatch to custom printer function or PrintGeneralOperation - virtual void PrintOperation(Operation* op); + virtual void PrintOperation(const Operation& op); /// @brief print operation itself without its regions - void PrintOperationWithNoRegion(Operation* op); + void PrintOperationWithNoRegion(const Operation& op); /// @brief print operation and its regions - void PrintGeneralOperation(Operation* op); + void PrintGeneralOperation(const Operation& op); void PrintRegion(const Region& Region); void PrintBlock(const Block& block); virtual void PrintValue(Value v); - void PrintOpResult(Operation* op); + void PrintOpResult(const Operation& op); - void PrintAttributeMap(Operation* op); + void PrintAttributeMap(const Operation& op); - void PrintOpOperands(Operation* op); + void PrintOpOperands(const Operation& op); - void PrintOperandsType(Operation* op); + void PrintOperandsType(const Operation& op); - void PrintOpReturnType(Operation* op); + void PrintOpReturnType(const Operation& op); void AddValueAlias(Value value, const std::string& alias); @@ -90,7 +90,7 @@ using TypePrintHook = using AttributePrintHook = std::function; // NOLINT using OpPrintHook = - std::function; // NOLINT + std::function; // NOLINT struct IR_API PrintHooks { ValuePrintHook value_print_hook{nullptr}; diff --git a/paddle/pir/include/core/op_base.h b/paddle/pir/include/core/op_base.h index b5cd58e4a91cf..90e1ab2f6fe41 100644 --- a/paddle/pir/include/core/op_base.h +++ b/paddle/pir/include/core/op_base.h @@ -29,7 +29,8 @@ class Block; class IR_API OpBase { public: - explicit OpBase(Operation *operation = nullptr) : operation_(operation) {} + explicit OpBase(const Operation *operation = nullptr) + : operation_(const_cast(operation)) {} Operation *operation() const { PADDLE_ENFORCE_NOT_NULL( @@ -91,11 +92,11 @@ template class OpTraitBase : public OpBase { public: using Base = OpTraitBase; - explicit OpTraitBase(Operation *op) : OpBase(op) {} + explicit OpTraitBase(const Operation *op) : OpBase(op) {} static TypeId GetTraitId() { return TypeId::get(); } - static ConcreteTrait dyn_cast(Operation *op) { + static ConcreteTrait dyn_cast(const Operation *op) { if (op && op->HasTrait()) { return ConcreteTrait(op); } @@ -109,7 +110,7 @@ class OpTraitBase : public OpBase { template class OpInterfaceBase : public OpBase { public: - explicit OpInterfaceBase(Operation *op) : OpBase(op) {} + explicit OpInterfaceBase(const Operation *op) : OpBase(op) {} /// /// \brief Accessor for the ID of this interface. 
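
The same const-qualification flows through OpTraitBase and OpInterfaceBase, so trait constructors written against this patch should take a const pointer. A minimal sketch of a user-defined trait under the new signature (NoAliasTrait is a hypothetical name used only for illustration):

    #include "paddle/pir/include/core/op_base.h"

    // Hypothetical marker trait; the only change this patch requires of
    // downstream traits is the const qualifier on the constructor argument.
    class NoAliasTrait : public pir::OpTraitBase<NoAliasTrait> {
     public:
      explicit NoAliasTrait(const pir::Operation *op)
          : pir::OpTraitBase<NoAliasTrait>(op) {}
    };
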
@@ -123,7 +124,7 @@ class OpInterfaceBase : public OpBase { return op->HasInterface(); } - static ConcreteInterface dyn_cast(Operation *op) { + static ConcreteInterface dyn_cast(const Operation *op) { if (op && op->HasInterface()) { return ConcreteInterface( op, op->info().GetInterfaceImpl()); @@ -158,7 +159,7 @@ class Op : public OpBase { std::tuple>::Type; // TODO(zhangbopd): Use classof - static ConcreteOp dyn_cast(Operation *op) { + static ConcreteOp dyn_cast(const Operation *op) { if (op && op->info().id() == TypeId::get()) { return ConcreteOp(op); } diff --git a/paddle/pir/include/core/op_trait.h b/paddle/pir/include/core/op_trait.h index dc308645cd034..930697d13dd0e 100644 --- a/paddle/pir/include/core/op_trait.h +++ b/paddle/pir/include/core/op_trait.h @@ -26,7 +26,7 @@ namespace pir { class IR_API SameOperandsShapeTrait : public pir::OpTraitBase { public: - explicit SameOperandsShapeTrait(pir::Operation *op) + explicit SameOperandsShapeTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(Operation *op); }; @@ -38,7 +38,7 @@ class IR_API SameOperandsShapeTrait class IR_API SameOperandsAndResultShapeTrait : public pir::OpTraitBase { public: - explicit SameOperandsAndResultShapeTrait(pir::Operation *op) + explicit SameOperandsAndResultShapeTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(Operation *op); }; @@ -50,7 +50,7 @@ class IR_API SameOperandsAndResultShapeTrait class IR_API SameOperandsElementTypeTrait : public pir::OpTraitBase { public: - explicit SameOperandsElementTypeTrait(pir::Operation *op) + explicit SameOperandsElementTypeTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(Operation *op); }; @@ -62,7 +62,7 @@ class IR_API SameOperandsElementTypeTrait class IR_API SameOperandsAndResultElementTypeTrait : public pir::OpTraitBase { public: - explicit SameOperandsAndResultElementTypeTrait(pir::Operation *op) + explicit SameOperandsAndResultElementTypeTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(Operation *op); }; @@ -75,7 +75,7 @@ class IR_API SameOperandsAndResultElementTypeTrait class IR_API SameOperandsAndResultTypeTrait : public pir::OpTraitBase { public: - explicit SameOperandsAndResultTypeTrait(pir::Operation *op) + explicit SameOperandsAndResultTypeTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(Operation *op); @@ -88,7 +88,7 @@ class IR_API SameOperandsAndResultTypeTrait class IR_API SameTypeOperandsTrait : public pir::OpTraitBase { public: - explicit SameTypeOperandsTrait(pir::Operation *op) + explicit SameTypeOperandsTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(Operation *op); }; diff --git a/paddle/pir/include/core/operation.h b/paddle/pir/include/core/operation.h index b5f60690d8190..5d4a17f21452c 100644 --- a/paddle/pir/include/core/operation.h +++ b/paddle/pir/include/core/operation.h @@ -184,7 +184,7 @@ class IR_API alignas(8) Operation final operator Block::ConstIterator() const { return position_; } void MoveTo(Block *block, Block::Iterator position); - void Print(std::ostream &os); + void Print(std::ostream &os) const; pir::OpInfo info() const { return info_; } std::string name() const; @@ -207,7 +207,7 @@ class IR_API alignas(8) Operation final bool use_empty(); template - T dyn_cast() { + T dyn_cast() const { return CastUtil::call(this); } @@ -256,7 +256,7 @@ class IR_API alignas(8) Operation final template struct CastUtil { - static To call(Operation *op) { + static To call(const 
Operation *op) { throw("Can't dyn_cast to To, To should be a Op or Trait or Interface"); } }; @@ -269,7 +269,7 @@ class IR_API alignas(8) Operation final struct CastUtil< To, typename std::enable_if::value>::type> { - static To call(Operation *op) { return To::dyn_cast(op); } + static To call(const Operation *op) { return To::dyn_cast(op); } }; AttributeMap attributes_; @@ -296,4 +296,6 @@ class IR_API alignas(8) Operation final Block::Iterator position_; }; +IR_API std::ostream &operator<<(std::ostream &os, const Operation &op); + } // namespace pir diff --git a/paddle/pir/include/dialect/control_flow/ir/cf_dialect.h b/paddle/pir/include/dialect/control_flow/ir/cf_dialect.h index 8a34c18213a60..be0ab54ca62bb 100644 --- a/paddle/pir/include/dialect/control_flow/ir/cf_dialect.h +++ b/paddle/pir/include/dialect/control_flow/ir/cf_dialect.h @@ -25,7 +25,7 @@ class ControlFlowDialect : public Dialect { } static const char *name() { return "cf"; } TEST_API void PrintType(Type type, std::ostream &os) const override; - TEST_API OpPrintFn PrintOperation(Operation *op) const override; + TEST_API OpPrintFn PrintOperation(const Operation &op) const override; private: TEST_API void initialize(); diff --git a/paddle/pir/include/dialect/control_flow/ir/cf_interface.h b/paddle/pir/include/dialect/control_flow/ir/cf_interface.h index 81ebb3649c681..21ca55b872704 100644 --- a/paddle/pir/include/dialect/control_flow/ir/cf_interface.h +++ b/paddle/pir/include/dialect/control_flow/ir/cf_interface.h @@ -77,7 +77,7 @@ class ContainerOpInterface : public OpInterfaceBase { TuplePushOp tuple_push_op(); TuplePopOp tuple_pop_op(); /// Constructor - ContainerOpInterface(pir::Operation* op, Concept* impl) + ContainerOpInterface(const pir::Operation* op, Concept* impl) : OpInterfaceBase(op), impl_(impl) {} private: diff --git a/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/cache_grad_op_symbolic_shape.h b/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/cache_grad_op_symbolic_shape.h index 1e5d34a2cbf80..1489536769f26 100644 --- a/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/cache_grad_op_symbolic_shape.h +++ b/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/cache_grad_op_symbolic_shape.h @@ -46,7 +46,7 @@ class CacheGradOpSymbolicShapeInterface }; /// Constructor - CacheGradOpSymbolicShapeInterface(pir::Operation *op, Concept *impl) + CacheGradOpSymbolicShapeInterface(const pir::Operation *op, Concept *impl) : pir::OpInterfaceBase(op), impl_(impl) {} diff --git a/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.h b/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.h index bd4b16bbc75fd..20a908141ef48 100644 --- a/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.h +++ b/paddle/pir/include/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.h @@ -48,7 +48,7 @@ class InferSymbolicShapeInterface }; /// Constructor - InferSymbolicShapeInterface(pir::Operation *op, Concept *impl) + InferSymbolicShapeInterface(const pir::Operation *op, Concept *impl) : pir::OpInterfaceBase(op), impl_(impl) {} bool InferSymbolicShape(pir::InferSymbolicShapeContext *infer_context); diff --git a/paddle/pir/src/core/builtin_op.cc b/paddle/pir/src/core/builtin_op.cc index b6d3eb5d90dbd..1c1dd10fb97ad 100644 --- a/paddle/pir/src/core/builtin_op.cc +++ b/paddle/pir/src/core/builtin_op.cc @@ -171,15 +171,15 @@ void GroupOp::VerifySig() {} void 
GroupOp::Print(IrPrinter &printer) { auto &os = printer.os; auto op = operation(); - printer.PrintOpResult(op); + printer.PrintOpResult(*op); os << " = \"" << name() << "\" [id:" << op->id() << "]"; - printer.PrintOpOperands(op); + printer.PrintOpOperands(*op); os << " -> "; - printer.PrintOpReturnType(op); + printer.PrintOpReturnType(*op); os << " {\n"; printer.AddIndentation(); for (auto &sub_op : GetOperators()) { - printer.PrintOperation(sub_op); + printer.PrintOperation(*sub_op); os << "\n"; } printer.DecreaseIndentation(); @@ -644,7 +644,7 @@ void ConstantTensorOp::VerifySig() const { common::errors::InvalidArgument("Type of value must be str attribute")); } -ConstantTensorOp ConstantTensorOp::dyn_cast(Operation *op) { +ConstantTensorOp ConstantTensorOp::dyn_cast(const Operation *op) { if (ConstantTensorOp::classof(op)) return ConstantTensorOp(op); return ConstantTensorOp(nullptr); } diff --git a/paddle/pir/src/core/dialect.cc b/paddle/pir/src/core/dialect.cc index 668c56111d0ac..80636c67fe0fc 100644 --- a/paddle/pir/src/core/dialect.cc +++ b/paddle/pir/src/core/dialect.cc @@ -28,7 +28,7 @@ void Dialect::RegisterInterface(std::unique_ptr interface) { std::move(interface)); } -OpPrintFn Dialect::PrintOperation(Operation *op) const { return nullptr; } +OpPrintFn Dialect::PrintOperation(const Operation &op) const { return nullptr; } DialectInterface::~DialectInterface() = default; diff --git a/paddle/pir/src/core/ir_printer.cc b/paddle/pir/src/core/ir_printer.cc index 57ac01c256319..ffe6447425f77 100644 --- a/paddle/pir/src/core/ir_printer.cc +++ b/paddle/pir/src/core/ir_printer.cc @@ -172,10 +172,10 @@ void IrPrinter::PrintProgram(const Program* program) { } } -void IrPrinter::PrintOperation(Operation* op) { +void IrPrinter::PrintOperation(const Operation& op) { os << indentation(); - if (auto* dialect = op->dialect()) { + if (auto* dialect = op.dialect()) { if (auto print_fn = dialect->PrintOperation(op)) { print_fn(op, *this); return; @@ -185,15 +185,15 @@ void IrPrinter::PrintOperation(Operation* op) { PrintGeneralOperation(op); } -void IrPrinter::PrintOperationWithNoRegion(Operation* op) { +void IrPrinter::PrintOperationWithNoRegion(const Operation& op) { // TODO(lyk): add API to get opresults directly PrintOpResult(op); os << " ="; - os << " \"" << op->name() << "\""; + os << " \"" << op.name() << "\""; if (VLOG_IS_ON(1) || FLAGS_pir_debug) { - os << " [id:" << op->id() << "]"; + os << " [id:" << op.id() << "]"; } // TODO(lyk): add API to get operands directly @@ -210,13 +210,13 @@ void IrPrinter::PrintOperationWithNoRegion(Operation* op) { PrintOpReturnType(op); } -void IrPrinter::PrintGeneralOperation(Operation* op) { +void IrPrinter::PrintGeneralOperation(const Operation& op) { PrintOperationWithNoRegion(op); - if (op->num_regions() > 0) { + if (op.num_regions() > 0) { os << newline; } - for (size_t i = 0; i < op->num_regions(); ++i) { - auto& region = op->region(i); + for (size_t i = 0; i < op.num_regions(); ++i) { + auto& region = op.region(i); PrintRegion(region); } } @@ -241,7 +241,7 @@ void IrPrinter::PrintBlock(const Block& block) { os << "\n"; } for (auto& item : block) { - PrintOperation(&item); + PrintOperation(item); os << "\n"; } DecreaseIndentation(); @@ -273,13 +273,13 @@ void IrPrinter::PrintValue(Value v) { } } -void IrPrinter::PrintOpResult(Operation* op) { +void IrPrinter::PrintOpResult(const Operation& op) { os << "("; - auto num_op_result = op->num_results(); + auto num_op_result = op.num_results(); std::vector op_results; op_results.reserve(num_op_result); 
for (size_t idx = 0; idx < num_op_result; idx++) { - op_results.push_back(op->result(idx)); + op_results.push_back(op.result(idx)); } pir::detail::PrintInterleave( op_results.begin(), @@ -289,8 +289,8 @@ void IrPrinter::PrintOpResult(Operation* op) { os << ")"; } -void IrPrinter::PrintAttributeMap(Operation* op) { - AttributeMap attributes = op->attributes(); +void IrPrinter::PrintAttributeMap(const Operation& op) { + AttributeMap attributes = op.attributes(); std::map> order_attributes( attributes.begin(), attributes.end()); @@ -312,13 +312,13 @@ void IrPrinter::PrintAttributeMap(Operation* op) { os << "}"; } -void IrPrinter::PrintOpOperands(Operation* op) { +void IrPrinter::PrintOpOperands(const Operation& op) { os << " ("; - auto num_op_operands = op->num_operands(); + auto num_op_operands = op.num_operands(); std::vector op_operands; op_operands.reserve(num_op_operands); for (size_t idx = 0; idx < num_op_operands; idx++) { - op_operands.push_back(op->operand_source(idx)); + op_operands.push_back(op.operand_source(idx)); } pir::detail::PrintInterleave( op_operands.begin(), @@ -328,12 +328,12 @@ void IrPrinter::PrintOpOperands(Operation* op) { os << ")"; } -void IrPrinter::PrintOperandsType(Operation* op) { - auto num_op_operands = op->num_operands(); +void IrPrinter::PrintOperandsType(const Operation& op) { + auto num_op_operands = op.num_operands(); std::vector op_operand_types; op_operand_types.reserve(num_op_operands); for (size_t idx = 0; idx < num_op_operands; idx++) { - auto op_operand = op->operand(idx); + auto op_operand = op.operand(idx); if (op_operand) { op_operand_types.push_back(op_operand.type()); } else { @@ -349,12 +349,12 @@ void IrPrinter::PrintOperandsType(Operation* op) { os << ")"; } -void IrPrinter::PrintOpReturnType(Operation* op) { - auto num_op_result = op->num_results(); +void IrPrinter::PrintOpReturnType(const Operation& op) { + auto num_op_result = op.num_results(); std::vector op_result_types; op_result_types.reserve(num_op_result); for (size_t idx = 0; idx < num_op_result; idx++) { - auto op_result = op->result(idx); + auto op_result = op.result(idx); if (op_result) { op_result_types.push_back(op_result.type()); } else { @@ -397,7 +397,7 @@ class CustomPrinter : public IrPrinter { } } - void PrintOperation(Operation* op) override { + void PrintOperation(const Operation& op) override { if (hooks_.op_print_hook) { hooks_.op_print_hook(op, *this); } else { @@ -428,9 +428,9 @@ void Program::Print(std::ostream& os) const { printer.PrintProgram(this); } -void Operation::Print(std::ostream& os) { +void Operation::Print(std::ostream& os) const { IrPrinter printer(os); - printer.PrintOperation(this); + printer.PrintOperation(*this); } void Value::Print(std::ostream& os) const { @@ -458,6 +458,12 @@ std::ostream& operator<<(std::ostream& os, Attribute attr) { return os; } +std::ostream& operator<<(std::ostream& os, const Block& block) { + IrPrinter printer(os); + printer.PrintBlock(block); + return os; +} + std::ostream& operator<<(std::ostream& os, const Program& prog) { prog.Print(os); return os; diff --git a/paddle/pir/src/core/op_trait.cc b/paddle/pir/src/core/op_trait.cc index 3c2a15e8b80b6..f583220575fa2 100644 --- a/paddle/pir/src/core/op_trait.cc +++ b/paddle/pir/src/core/op_trait.cc @@ -20,7 +20,7 @@ namespace { -void VerifySameOperandsShapeTrait(pir::Operation *op) { +void VerifySameOperandsShapeTrait(const pir::Operation *op) { VLOG(10) << "Verify SameOperandsShapeTrait for : " << op->name(); PADDLE_ENFORCE_GT( @@ -47,7 +47,7 @@ void 
VerifySameOperandsShapeTrait(pir::Operation *op) { op->name())); } -void VerifySameOperandsAndResultShapeTrait(pir::Operation *op) { +void VerifySameOperandsAndResultShapeTrait(const pir::Operation *op) { VLOG(10) << "Verify SameOperandsAndResultShapeTrait for : " << op->name(); PADDLE_ENFORCE_GT( @@ -90,7 +90,7 @@ void VerifySameOperandsAndResultShapeTrait(pir::Operation *op) { op->name())); } -void VerifySameOperandsElementTypeTrait(pir::Operation *op) { +void VerifySameOperandsElementTypeTrait(const pir::Operation *op) { VLOG(10) << "Verify SameOperandsElementTypeTrait for : " << op->name(); PADDLE_ENFORCE_GT( @@ -114,7 +114,7 @@ void VerifySameOperandsElementTypeTrait(pir::Operation *op) { } } -void VerifySameOperandsAndResultElementTypeTrait(pir::Operation *op) { +void VerifySameOperandsAndResultElementTypeTrait(const pir::Operation *op) { VLOG(10) << "Verify SameOperandsAndResultElementTypeTrait for : " << op->name(); @@ -161,7 +161,7 @@ void VerifySameOperandsAndResultElementTypeTrait(pir::Operation *op) { } } -void VerifySameOperandsAndResultTypeTrait(pir::Operation *op) { +void VerifySameOperandsAndResultTypeTrait(const pir::Operation *op) { VLOG(10) << "Verify SameOperandsAndResultTypeTrait for : " << op->name(); PADDLE_ENFORCE_GT( @@ -222,7 +222,7 @@ void VerifySameOperandsAndResultTypeTrait(pir::Operation *op) { } } -void VerifySameTypeOperandsTrait(pir::Operation *op) { +void VerifySameTypeOperandsTrait(const pir::Operation *op) { VLOG(10) << "Verify SameTypeOperandsTrait for : " << op->name(); // For zero or only one operand. @@ -242,7 +242,7 @@ void VerifySameTypeOperandsTrait(pir::Operation *op) { } } -void VerifyOneResultTrait(pir::Operation *op) { +void VerifyOneResultTrait(const pir::Operation *op) { PADDLE_ENFORCE_EQ( op->num_results(), 1, diff --git a/paddle/pir/src/core/operation.cc b/paddle/pir/src/core/operation.cc index 957720cfdbdc8..bf7a42cb15bd8 100644 --- a/paddle/pir/src/core/operation.cc +++ b/paddle/pir/src/core/operation.cc @@ -460,4 +460,10 @@ void *Operation::value_property(const std::string &key, size_t index) const { COMPONENT_IMPL(op_result, OpResult) COMPONENT_IMPL(op_operand, OpOperand) + +IR_API std::ostream &operator<<(std::ostream &os, const Operation &op) { + op.Print(os); + return os; +} + } // namespace pir diff --git a/paddle/pir/src/dialect/control_flow/ir/cf_dialect.cc b/paddle/pir/src/dialect/control_flow/ir/cf_dialect.cc index 09bbdf0bd6077..5858e7cc88a56 100644 --- a/paddle/pir/src/dialect/control_flow/ir/cf_dialect.cc +++ b/paddle/pir/src/dialect/control_flow/ir/cf_dialect.cc @@ -36,10 +36,11 @@ void ControlFlowDialect::PrintType(pir::Type type, std::ostream& os) const { } } -pir::OpPrintFn ControlFlowDialect::PrintOperation(pir::Operation* op) const { - if (auto create_op = op->dyn_cast()) { - return [](pir::Operation* op, pir::IrPrinter& printer) { - auto create_op = op->dyn_cast(); +pir::OpPrintFn ControlFlowDialect::PrintOperation( + const pir::Operation& op) const { + if (auto create_op = op.dyn_cast()) { + return [](const pir::Operation& op, pir::IrPrinter& printer) { + auto create_op = op.dyn_cast(); create_op.Print(printer); }; } diff --git a/paddle/pir/src/dialect/control_flow/ir/cf_op.cc b/paddle/pir/src/dialect/control_flow/ir/cf_op.cc index 9fb12cf1b78b4..6b66ba21478ec 100644 --- a/paddle/pir/src/dialect/control_flow/ir/cf_op.cc +++ b/paddle/pir/src/dialect/control_flow/ir/cf_op.cc @@ -250,7 +250,7 @@ void StackCreateOp::Print(IrPrinter &printer) { // NOLINT printer.AddValueAlias(inlet(), "%inlet_" + std::to_string(index)); 
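
Because Operation::Print is now const and stream operators exist for both Operation and Block (see the operation.cc hunk above), IR can be dumped without the const_cast that OpsDebugStr previously needed. A small usage sketch, assuming an already-built block:

    #include <iostream>
    #include "paddle/pir/include/core/block.h"
    #include "paddle/pir/include/core/operation.h"

    // Sketch: const-safe IR logging enabled by this patch.
    void DumpIr(const pir::Block &block) {
      std::cout << block;         // whole block via operator<<(os, const Block&)
      for (const auto &op : block) {
        std::cout << op << "\n";  // one op via operator<<(os, const Operation&)
      }
    }
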
printer.AddValueAlias(outlet(), "%outlet_" + std::to_string(index)); } - printer.PrintGeneralOperation(*this); + printer.PrintGeneralOperation(**this); } } // namespace pir diff --git a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc index 090a2edff94fe..e594e83340604 100644 --- a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc +++ b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc @@ -98,14 +98,14 @@ std::string PrintOperationWithNoRegion(Operation* op) { } os << ")"; - printer.PrintAttributeMap(op); + printer.PrintAttributeMap(*op); os << " :"; // PrintOpSignature - printer.PrintOperandsType(op); + printer.PrintOperandsType(*op); os << " -> "; - printer.PrintOpReturnType(op); + printer.PrintOpReturnType(*op); return os.str(); } diff --git a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc index 6c1eed5457528..f24bccfdb6be5 100644 --- a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc +++ b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc @@ -736,21 +736,21 @@ symbol::DimExpr ShapeConstraintIRAnalysis::GetProductDimExpr( pir::PrintHooks ShapeConstraintIRAnalysis::PrintHook() { pir::PrintHooks print_hook; - print_hook.op_print_hook = [&](Operation* op, IrPrinter& printer) { + print_hook.op_print_hook = [&](const Operation& op, IrPrinter& printer) { printer.IrPrinter::PrintOperation(op); printer.os << " { "; - for (uint32_t i = 0; i < op->num_results(); ++i) { - if (context_.HasShapeOrDataForValue(op->result(i))) { - printer.os << "(" << this->GetShapeOrDataForValue(op->result(i)) << ")"; + for (uint32_t i = 0; i < op.num_results(); ++i) { + if (context_.HasShapeOrDataForValue(op.result(i))) { + printer.os << "(" << this->GetShapeOrDataForValue(op.result(i)) << ")"; } else { printer.os << "()"; } - if (i < op->num_results() - 1) { + if (i < op.num_results() - 1) { printer.os << ", "; } } printer.os << " }"; - printer.os << "\t(op_" << op->id() << ")"; + printer.os << "\t(op_" << op.id() << ")"; }; return print_hook; } diff --git a/test/cpp/pir/core/ir_op_test.cc b/test/cpp/pir/core/ir_op_test.cc index a9ce28f69d22f..7c780d268f0c1 100644 --- a/test/cpp/pir/core/ir_op_test.cc +++ b/test/cpp/pir/core/ir_op_test.cc @@ -530,11 +530,11 @@ TEST(printer_test, custom_hooks) { printer.os << " [extra info]"; }; // this one overrides old printing - hooks.op_print_hook = [](pir::Operation *op, pir::IrPrinter &printer) { + hooks.op_print_hook = [](const pir::Operation &op, pir::IrPrinter &printer) { printer.PrintOpResult(op); printer.os << " :="; - printer.os << " \"" << op->name() << "\""; + printer.os << " \"" << op.name() << "\""; printer.PrintOpOperands(op); printer.PrintAttributeMap(op); printer.os << " :"; diff --git a/test/cpp/pir/tools/test1_dialect.cc b/test/cpp/pir/tools/test1_dialect.cc index 1a575629a69b1..0e4cd5105fdc6 100644 --- a/test/cpp/pir/tools/test1_dialect.cc +++ b/test/cpp/pir/tools/test1_dialect.cc @@ -25,12 +25,12 @@ void Test1Dialect::initialize() { RegisterOps(); } -pir::OpPrintFn Test1Dialect::PrintOperation(pir::Operation *op) const { - return [](pir::Operation *op, pir::IrPrinter &printer) { +pir::OpPrintFn Test1Dialect::PrintOperation(const pir::Operation &op) const { + return [](const pir::Operation &op, pir::IrPrinter &printer) { printer.PrintOpResult(op); printer.os << " ="; - printer.os << " \"" << op->name() << "\""; + printer.os << " \"" << op.name() << "\""; 
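
The custom-hooks test above doubles as a template for user code; a condensed sketch of registering a const-correct per-op hook (the appended id annotation is illustrative):

    pir::PrintHooks hooks;
    hooks.op_print_hook = [](const pir::Operation &op, pir::IrPrinter &printer) {
      printer.IrPrinter::PrintOperation(op);      // keep the default output
      printer.os << "  (op_" << op.id() << ")";   // trailing annotation
    };

As in ShapeConstraintIRAnalysis::PrintHook above, a printer constructed with these hooks invokes op_print_hook for every operation it visits.
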
printer.PrintOpOperands(op); }; } diff --git a/test/cpp/pir/tools/test1_dialect.h b/test/cpp/pir/tools/test1_dialect.h index c6124430538a0..dfcfee6f4b32e 100644 --- a/test/cpp/pir/tools/test1_dialect.h +++ b/test/cpp/pir/tools/test1_dialect.h @@ -22,7 +22,7 @@ class Test1Dialect : public pir::Dialect { public: explicit Test1Dialect(pir::IrContext *context); static const char *name() { return "test1"; } - pir::OpPrintFn PrintOperation(pir::Operation *op) const override; + pir::OpPrintFn PrintOperation(const pir::Operation &op) const override; private: void initialize(); diff --git a/test/cpp/pir/tools/test_dialect.cc b/test/cpp/pir/tools/test_dialect.cc index e088c4809ff67..5a6bf7cea8ac1 100644 --- a/test/cpp/pir/tools/test_dialect.cc +++ b/test/cpp/pir/tools/test_dialect.cc @@ -41,12 +41,12 @@ void TestDialect::initialize() { SameOperandsAndResultTypeTraitOp3>(); } -pir::OpPrintFn TestDialect::PrintOperation(pir::Operation *op) const { - return [](pir::Operation *op, pir::IrPrinter &printer) { +pir::OpPrintFn TestDialect::PrintOperation(const pir::Operation &op) const { + return [](const pir::Operation &op, pir::IrPrinter &printer) { printer.PrintOpResult(op); printer.os << " ="; - printer.os << " \"" << op->name() << "\""; + printer.os << " \"" << op.name() << "\""; printer.PrintOpOperands(op); }; } diff --git a/test/cpp/pir/tools/test_dialect.h b/test/cpp/pir/tools/test_dialect.h index fbf6200bc4782..758d1041c12c2 100644 --- a/test/cpp/pir/tools/test_dialect.h +++ b/test/cpp/pir/tools/test_dialect.h @@ -22,7 +22,7 @@ class TestDialect : public pir::Dialect { public: explicit TestDialect(pir::IrContext *context); static const char *name() { return "test"; } - pir::OpPrintFn PrintOperation(pir::Operation *op) const override; + pir::OpPrintFn PrintOperation(const pir::Operation &op) const override; private: void initialize(); diff --git a/test/cpp/pir/tools/test_trait.h b/test/cpp/pir/tools/test_trait.h index 203875e3094c4..c4ba540578311 100644 --- a/test/cpp/pir/tools/test_trait.h +++ b/test/cpp/pir/tools/test_trait.h @@ -22,13 +22,13 @@ namespace test { class ReadOnlyTrait : public pir::OpTraitBase { public: - explicit ReadOnlyTrait(pir::Operation *op) + explicit ReadOnlyTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} }; class OneRegionTrait : public pir::OpTraitBase { public: - explicit OneRegionTrait(pir::Operation *op) + explicit OneRegionTrait(const pir::Operation *op) : pir::OpTraitBase(op) {} static void Verify(pir::Operation *op); }; From 8db2e3d48286eb3ef2db6d4f4810bb14b0700877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E7=8C=9C?= Date: Tue, 8 Oct 2024 16:31:44 +0800 Subject: [PATCH 030/135] =?UTF-8?q?=E3=80=90Error=20Message=E3=80=91=20Mis?= =?UTF-8?q?cellaneous=20Modifications=20(#68528)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/cinn/adt/equation_solver.cc | 16 +-- .../hlir/framework/pir/trivial_op_impl.cc | 18 ++- paddle/cinn/hlir/op/contrib/argmin.cc | 23 +++- paddle/cinn/hlir/op/contrib/repeat.cc | 6 +- paddle/cinn/ir/schedule/impl/base.cc | 108 +++++++++--------- .../cinn/ir/schedule/impl/compute_location.cc | 36 +++--- paddle/cinn/ir/schedule/impl/for_type.cc | 14 +-- .../ir/schedule/impl/loop_transformation.cc | 50 ++++---- paddle/cinn/ir/schedule/impl/reduction.cc | 12 +- paddle/cinn/ir/schedule/impl/storage.cc | 40 ++++--- paddle/cinn/ir/schedule/ir_schedule_util.cc | 31 ++--- .../fusion_tracker/expr_utils.cc | 6 +- paddle/cinn/operator_fusion/utils.cc | 6 +- paddle/fluid/framework/barrier.h | 
21 +++- .../tensorrt_engine_instruction.cc | 8 +- paddle/fluid/framework/tensor_util.cc | 4 +- .../multiary_infer_sym.cc | 26 ++--- .../infer_symbolic_shape/unary_infer_sym.cc | 14 +-- paddle/fluid/pybind/pybind.cc | 2 +- paddle/phi/infermeta/binary.cc | 4 +- paddle/phi/infermeta/spmd_rules/pad.cc | 16 +-- .../kernels/cpu/lookup_table_grad_kernel.cc | 6 +- paddle/phi/kernels/cpu/lookup_table_kernel.cc | 4 +- .../kernels/gpu/lookup_table_grad_kernel.cu | 2 +- .../cpu/lookup_table_grad_kernel.cc | 6 +- .../selected_rows/cpu/lookup_table_kernel.cc | 6 +- .../gpu/lookup_table_grad_kernel.cu | 2 +- .../phi/kernels/xpu/top_p_sampling_kernel.cc | 2 +- test/cpp/inference/api/tester_helper.h | 54 +++++++-- 29 files changed, 320 insertions(+), 223 deletions(-) diff --git a/paddle/cinn/adt/equation_solver.cc b/paddle/cinn/adt/equation_solver.cc index 6651bf38f495c..400dbf13a00b7 100644 --- a/paddle/cinn/adt/equation_solver.cc +++ b/paddle/cinn/adt/equation_solver.cc @@ -37,8 +37,8 @@ std::unordered_map InferValuesImpl( PADDLE_ENFORCE_EQ( ctx->HasValue(in_variable), true, - phi::errors::NotFound("The param id's out_iter must contain " - "its in_iter's value")); + ::common::errors::NotFound("The param id's out_iter must contain " + "its in_iter's value")); return {{out_iter.value(), ctx->GetValue(in_variable)}}; } @@ -49,8 +49,8 @@ std::unordered_map InferValuesImpl( PADDLE_ENFORCE_EQ( ctx->HasValue(in_variable), true, - phi::errors::NotFound("The param id's out_iter must contain " - "its in_iter's value")); + ::common::errors::NotFound("The param id's out_iter must contain " + "its in_iter's value")); return {{out_index.value(), ctx->GetValue(in_variable)}}; } @@ -215,7 +215,7 @@ std::unordered_map InferValuesImpl( PADDLE_ENFORCE_EQ( ret.emplace(out_msg_in_indexes.value()->at(i), value).second, true, - phi::errors::AlreadyExists([&]() { + ::common::errors::AlreadyExists([&]() { std::ostringstream oss; oss << "Failed to insert the variable '" << "out_msg_in_indexes.value()->at(" << i @@ -229,7 +229,7 @@ std::unordered_map InferValuesImpl( if (out_index.has_value()) { PADDLE_ENFORCE_EQ(ret.emplace(out_index.value(), value).second, true, - phi::errors::AlreadyExists([&]() { + ::common::errors::AlreadyExists([&]() { std::ostringstream oss; oss << "Failed to insert the variable '" << "out_index.value()" @@ -306,7 +306,9 @@ void SolveEquations( tValueInferSuccess has_unique_value = MergeInferedValuesIntoCtx(function, ctx); PADDLE_ENFORCE_EQ( - has_unique_value.value(), true, phi::errors::InvalidArgument([&]() { + has_unique_value.value(), + true, + ::common::errors::InvalidArgument([&]() { std::ostringstream oss; oss << "Failed to merge inferred values into the context for " "function '" diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc b/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc index a085183d50925..5ce62346d55d9 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc +++ b/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc @@ -695,7 +695,15 @@ std::vector GetLoopStrides(const ir::Expr& body, auto* block = expr_block.As(); auto& iter_values = block->iter_values; auto& iter_vars = block->schedule_block.As()->iter_vars; - CHECK_EQ(iter_values.size(), iter_vars.size()); + PADDLE_ENFORCE_EQ( + iter_values.size(), + iter_vars.size(), + ::common::errors::InvalidArgument( + "The size of iter_values should be equal to iter_vars.\n" + "But now received: \n" + "iter_values: %d, and iter_vars: %d.", + iter_values.size(), + iter_vars.size())); const std::vector for_iters = 
trivial_fusion_detail::GetAllForIters(body); @@ -744,7 +752,13 @@ std::shared_ptr GetFusionGroupInfo( for (const auto& body : op_compute_bodies) { std::vector split_transform_block = GetSplitTransformBlock(body); if (!split_transform_block.empty()) { - CHECK_EQ(split_transform_block.size(), 1); + PADDLE_ENFORCE_EQ(split_transform_block.size(), + 1, + ::common::errors::InvalidArgument( + "The size of split_transform_block should be 1.\n" + "But received: \n" + "split_transform_block: %d.", + split_transform_block.size())); group_info->loop_strides = GetLoopStrides(body, split_transform_block[0]); } diff --git a/paddle/cinn/hlir/op/contrib/argmin.cc b/paddle/cinn/hlir/op/contrib/argmin.cc index 33e71632a86d6..b152da1ee2197 100644 --- a/paddle/cinn/hlir/op/contrib/argmin.cc +++ b/paddle/cinn/hlir/op/contrib/argmin.cc @@ -57,8 +57,14 @@ std::vector Argmin(const Tensor &in_tensor, if (axis < 0) { pos_axis = static_cast(ndim) + axis; } - CHECK_LT(pos_axis, ndim) << "Axis must be less than tensor's dim"; - CHECK_GE(pos_axis, 0) << "Axis must be more than 0"; + PADDLE_ENFORCE_LT(pos_axis, + ndim, + ::common::errors::InvalidArgument( + "[Error info] Axis must be less than tensor's dim.")); + PADDLE_ENFORCE_GE(pos_axis, + 0, + ::common::errors::InvalidArgument( + "[Error info] Axis must be more than 0.")); std::vector output_shape; for (int i = 0; i < shape.size(); ++i) { @@ -119,15 +125,22 @@ std::shared_ptr StrategyForArgmin( ::common::errors::InvalidArgument( "The input argument of argmin compute is empty! Please check.")); cinn::common::CINNValuePack pack_args = args[0]; - CHECK_GE(pack_args.size(), 1U) - << "There should be 1 input args for argmax compute"; + PADDLE_ENFORCE_GE( + pack_args.size(), + 1U, + ::common::errors::InvalidArgument( + "[Error info] There should be 1 input args for argmax compute.")); Expr in_expr = pack_args[0]; PADDLE_ENFORCE_NOT_NULL( in_expr.as_tensor(), ::common::errors::InvalidArgument( "The input argument of argmin compute is not tensor.")); Tensor in_tensor = in_expr.as_tensor_ref(); - CHECK_EQ(pack_args.size(), 2U); + PADDLE_ENFORCE_EQ( + pack_args.size(), + 2U, + ::common::errors::InvalidArgument("[Error info] The size of pack_args " + "should be equal to 2.")); PADDLE_ENFORCE_EQ( pack_args[1].is_string(), true, diff --git a/paddle/cinn/hlir/op/contrib/repeat.cc b/paddle/cinn/hlir/op/contrib/repeat.cc index de18e69adeabc..da988629f1826 100644 --- a/paddle/cinn/hlir/op/contrib/repeat.cc +++ b/paddle/cinn/hlir/op/contrib/repeat.cc @@ -157,7 +157,11 @@ std::shared_ptr StrategyForRepeat( VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ") << ", output_shapes: " << utils::Join(output_shapes[0], ", "); - CHECK_EQ(pack_args.size(), 2U); + PADDLE_ENFORCE_EQ( + pack_args.size(), + 2U, + ::common::errors::InvalidArgument( + "[Error info] The size of pack_args should equal to 2.")); std::string tensor_name = pack_args[1].operator std::string(); std::vector out = Repeat(tensor_A, repeats, axis, tensor_name); diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc index 49c39f62b0f05..e3bc2ab277f62 100644 --- a/paddle/cinn/ir/schedule/impl/base.cc +++ b/paddle/cinn/ir/schedule/impl/base.cc @@ -44,31 +44,31 @@ void DyScheduleImpl::MergeExprs() { if (exprs.size() <= 1U) return; PADDLE_ENFORCE_NOT_NULL( exprs[0].As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr[0] of module_expr should be a Block!\n" - "[Error info] The 
Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_EQ( exprs[0].As()->stmts.size(), 1U, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr[0] of module_expr should have only one stmt!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( exprs[0].As()->stmts[0].As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr[0] of module_expr should be Block with only one " "stmt which is " "a " "ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( @@ -77,14 +77,14 @@ void DyScheduleImpl::MergeExprs() { ->stmts[0] .As() ->schedule_block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr[0] of module_expr should be Block with only one " "stmt which is " "a " "ScheduleBlockRealize with a defined ScheduleBlock!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); std::vector merged_block; @@ -209,22 +209,22 @@ void DyScheduleImpl::Annotate(const Expr& block, PADDLE_ENFORCE_NOT_NULL( block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'block' must be a " "ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( block.As()->schedule_block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'block' must be a ScheduleBlockRealize " "with a defined ScheduleBlock!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); auto copied_block = ir::ir_utils::IRCopy(block); @@ -242,22 +242,22 @@ void DyScheduleImpl::Unannotate(Expr& block, PADDLE_ENFORCE_NOT_NULL( block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'block' must be a " "ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( block.As()->schedule_block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'block' must be a ScheduleBlockRealize " "with a defined ScheduleBlock!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); auto* schedule_block = block.As() @@ -280,22 +280,22 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_NOT_NULL( block.As(), - 
phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'block' must be a " "ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( block_target.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'block_target' must be a " "ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); auto exprs = this->GetModule().GetExprs(); @@ -303,11 +303,11 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ( exprs.size(), 1U, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Size of exprs of current module must be 1!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); auto expr = exprs[0]; @@ -325,12 +325,12 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ( vars[i]->upper_bound.defined() && vars_target[i]->upper_bound.defined(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Upper bound of iter_vars in both Expr parameter " "'block' and Expr parameter 'block_target' must be defined!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); if (vars[i]->upper_bound.is_constant() && @@ -348,7 +348,7 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ( !new_iter_values.empty(), true, - phi::errors::InvalidArgument([&]() -> std::string { + ::common::errors::InvalidArgument([&]() -> std::string { std::ostringstream oss; oss << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" @@ -356,7 +356,7 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, "source and target is not equal! 
" << vars[0]->upper_bound << " vs " << vars_target[0]->upper_bound << "\n" - << "[Error info] The Expr of current schedule is: " + << "[Expr info] The Expr of current schedule is: " << module_expr_.GetExprs() << "."; return oss.str(); }())); @@ -373,12 +373,12 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ(!used_target_loop_vars.empty(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] Cannot CopyTransformAndLoopInfo since " "there is no loop var in the new_iter_values!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); std::vector used_target_loops; @@ -393,13 +393,13 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, true); PADDLE_ENFORCE_EQ(find_loop_var.size(), 1U, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] Number of loop with iter_var which is " "used in ScheduleBlockRealize for indexing in " "Exprs[0] of module_exprs must be 1!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); used_target_loops.push_back(*find_loop_var.begin()); VLOG(3) << "used_target_loops push_back " << used_target_loops.back(); @@ -411,14 +411,14 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, for (int i = new_iter_values.size(); i < old_iter_values.size(); ++i) { PADDLE_ENFORCE_EQ(old_iter_values[i].as_var() != nullptr, true, - phi::errors::InvalidArgument([&]() -> std::string { + ::common::errors::InvalidArgument([&]() -> std::string { std::ostringstream oss; oss << "[IRScheduleError] An error occurred in the " "schedule primitive <" << primitive << ">.\n" << "[Error info] iter_vars[" << i << "] in Expr parameter 'block' must be vars!\n" - << "[Error info] The Expr of current schedule is: " + << "[Expr info] The Expr of current schedule is: " << module_expr_.GetExprs() << "."; return oss.str(); }())); @@ -433,14 +433,14 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, } else { PADDLE_ENFORCE_EQ(old_iter_values[changed_loop_num].as_var() != nullptr, true, - phi::errors::InvalidArgument([&]() -> std::string { + ::common::errors::InvalidArgument([&]() -> std::string { std::ostringstream oss; oss << "[IRScheduleError] An error occurred in the " "schedule primitive <" << primitive << ">.\n" << "[Error info] iter_vars[" << changed_loop_num << "] in Expr parameter 'block' must be vars!\n" - << "[Error info] The Expr of current schedule is: " + << "[Expr info] The Expr of current schedule is: " << module_expr_.GetExprs() << "."; return oss.str(); }())); @@ -457,14 +457,14 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ( find_partial_loop.size(), 1U, - phi::errors::InvalidArgument([&]() -> std::string { + ::common::errors::InvalidArgument([&]() -> std::string { std::ostringstream oss; oss << "[IRScheduleError] An error occurred in the schedule " "primitive <" << primitive << ">.\n" << "[Error info] Number of loop with iter_var which is " << old_var->name << " should be 1 in Exprs[0] of module_expr!\n" - << "[Error info] The Expr of current schedule is: " + << "[Expr info] The Expr of current schedule is: " << module_expr_.GetExprs() << "."; return oss.str(); }())); @@ -475,12 +475,12 @@ void 
DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, true); PADDLE_ENFORCE_EQ(find_schedule_block.size(), 1U, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] Number of ScheduleBlockRealize in " "partial_loop should be 1!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); Expr sch_block = (*find_schedule_block.begin()); @@ -491,12 +491,12 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ( !used_target_loops.empty(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Cannot CopyTransformAndLoopInfo since there is no loop " "which uses vars in the new_iter_values in Expr[0] of module_expr!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); Expr res; @@ -521,12 +521,12 @@ void DyScheduleImpl::CopyTransformAndLoopInfo(const Expr& block, PADDLE_ENFORCE_EQ(!all_loops.empty(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] Cannot CopyTransformAndLoopInfo since " "there is no loop in Expr parameter 'block'!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); this->Replace(all_loops[0], res); @@ -554,13 +554,13 @@ Expr DyScheduleImpl::SampleCategorical( PADDLE_ENFORCE_EQ(candidates.size(), probs.size(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] vector params(candidates) and " "vector params(probs) must " "have same size in SampleCategorical!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); int seed_idx = utils::SampleDiscreteFromDistribution(probs, rand_seed); @@ -580,42 +580,42 @@ std::vector DyScheduleImpl::SamplePerfectTile( PADDLE_ENFORCE_NOT_NULL( loop.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr parameter 'loop' should be a For loop.\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_GE(n, 2, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The number of tile factors (n) should be " "at least 2, but got %d.\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", n, module_expr_.GetExprs())); PADDLE_ENFORCE_GE(max_innermost_factor, 1, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The max innermost factor should be at " "least 1, but got %d.\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", max_innermost_factor, module_expr_.GetExprs())); PADDLE_ENFORCE_EQ(cinn::common::is_zero(loop.As()->min), true, - phi::errors::InvalidArgument( 
+ ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The For loop should start from 0.\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); int loop_extent = GetLoopExtent(loop); @@ -629,12 +629,12 @@ std::vector DyScheduleImpl::SamplePerfectTile( // Check whether a suitable innermost_factor has been found PADDLE_ENFORCE_EQ(!innermost_factors.empty(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] No innermost factor found for loop " "extent %d with max_innermost_factor %d.\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", loop_extent, max_innermost_factor, module_expr_.GetExprs()));
diff --git a/paddle/cinn/ir/schedule/impl/compute_location.cc index 8791dedd77d90..1c4a543a6dac5 100644 --- a/paddle/cinn/ir/schedule/impl/compute_location.cc +++ b/paddle/cinn/ir/schedule/impl/compute_location.cc @@ -44,19 +44,19 @@ void DyScheduleImpl::ComputeAt(const Expr& block, std::ostringstream os; PADDLE_ENFORCE_NOT_NULL( block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(block) should be a ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( loop.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(loop) should be a For node!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); Expr root = this->GetRootBlock(block); @@ -88,19 +88,19 @@ void DyScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { std::ostringstream os; PADDLE_ENFORCE_NOT_NULL( block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(block) should be a ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); PADDLE_ENFORCE_NOT_NULL( loop.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(loop) should be a For node!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); std::vector block_loops = this->GetLoops(block); Expr root = this->GetRootBlock(block); @@ -149,23 +149,23 @@ void DyScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) { PADDLE_ENFORCE_EQ( prove_eq.has_value(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Extent of loop in Expr Param(loop) and extent of " "loop in Expr Param(block) should be equal correspondingly!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs()));
PADDLE_ENFORCE_EQ( prove_eq.value(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Extent of loop in Expr Param(loop) and extent of " "loop in Expr Param(block) should be equal correspondingly!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); if (block_loops[i].As()->bind_info().valid() && !loops[i].As()->bind_info().valid()) { @@ -291,12 +291,12 @@ void DyScheduleImpl::ComputeInline(const Expr& schedule_block) { PADDLE_ENFORCE_NOT_NULL( schedule_block.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(schedule_block) should be a " "ScheduleBlockRealize!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); Expr root = this->GetRootBlock(schedule_block); @@ -306,12 +306,12 @@ void DyScheduleImpl::ComputeInline(const Expr& schedule_block) { PADDLE_ENFORCE_EQ( inliner.BodyPatternAllowInline(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Current IR can't meet the requirements of " "ComputeInline!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); // Create a plan that removes the block to be inlined @@ -342,12 +342,12 @@ void DyScheduleImpl::ReverseComputeInline(const Expr& schedule_block) { PADDLE_ENFORCE_EQ( inliner.BodyPatternAllowInline(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Current IR can't meet the requirements of " "ReverseComputeInline!\n" - "[Error info] The Expr of current schedule is: %s.", + "[Expr info] The Expr of current schedule is: %s.", module_expr_.GetExprs())); // Create a plan that removes the block to be inlined LeafBlockRemovalPlan remove_plan(
diff --git a/paddle/cinn/ir/schedule/impl/for_type.cc index 252240e214de1..facaefc694858 100644 --- a/paddle/cinn/ir/schedule/impl/for_type.cc +++ b/paddle/cinn/ir/schedule/impl/for_type.cc @@ -46,7 +46,7 @@ void DyScheduleImpl::MutateForType(const Expr& loop, auto* for_node = loop.As(); PADDLE_ENFORCE_NOT_NULL( for_node, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Loop parameter should be a For node!\n" @@ -92,7 +92,7 @@ void DyScheduleImpl::Vectorize(const Expr& loop, int factor) { PADDLE_ENFORCE_GT(factor, 0, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] Vectorize factor should be greater than 0.\n" @@ -102,7 +102,7 @@ void DyScheduleImpl::Vectorize(const Expr& loop, int factor) { PADDLE_ENFORCE_EQ( loop.As()->extent.is_constant(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The loop to be vectorized should be constant!\n" @@ -121,7 +121,7 @@ void DyScheduleImpl::Unroll(const Expr& loop) { PADDLE_ENFORCE_EQ(
loop.As()->extent.is_constant(), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The loop to be unrolled should be constant!\n" @@ -149,7 +149,7 @@ void DyScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) { PADDLE_ENFORCE_EQ( thread_axes.count(thread_axis), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The thread_axis which is %s is not supported\n" @@ -166,7 +166,7 @@ void DyScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) { if (thread_axis[0] == 'b') { PADDLE_ENFORCE_EQ(check_offset(thread_axis[0]), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the " "schedule primitive .\n" "[Error info] Invalid Bind! The extent of loop is " @@ -178,7 +178,7 @@ void DyScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) { } else { PADDLE_ENFORCE_EQ(check_offset(thread_axis[0]), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the " "schedule primitive .\n" "[Error info] Invalid Bind! The extent of loop is " diff --git a/paddle/cinn/ir/schedule/impl/loop_transformation.cc b/paddle/cinn/ir/schedule/impl/loop_transformation.cc index c44d69c246817..9a742434a8c09 100644 --- a/paddle/cinn/ir/schedule/impl/loop_transformation.cc +++ b/paddle/cinn/ir/schedule/impl/loop_transformation.cc @@ -46,7 +46,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_NOT_NULL( loop.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(loop) must be For node!\n" @@ -58,7 +58,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_EQ( cinn::common::is_zero(for_node->min), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The For node must start with 0!\n" @@ -68,7 +68,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_EQ( factors.empty(), false, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The factors param of Split should not be empty!\n" @@ -162,7 +162,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_LE( num_minus1, 1, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The params in factors of Split on dynamic shape should " @@ -173,7 +173,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_EQ( is_positive, true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The params in factors of Split on dynamic shape should " @@ -226,7 +226,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_NOT_NULL( loop.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Expr param(loop) must be For node!\n" @@ -238,7 +238,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, 
PADDLE_ENFORCE_EQ( common::is_zero(for_node->min), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The For node must start with 0!\n" @@ -248,7 +248,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_EQ( factors.empty(), false, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The factors param of Split should not be empty!" @@ -258,7 +258,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_EQ( loop.As()->extent.is_constant(), false, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Can't Split a loop with constant extent but with " @@ -282,7 +282,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, PADDLE_ENFORCE_EQ( analyzer.ProveEQ(tot_extent, prod_size).value_or(false), true, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Product of factors can't be proved to be equal to the " @@ -333,7 +333,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { PADDLE_ENFORCE_EQ( loops.empty(), false, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The loops param of Fuse should not be empty!\n" @@ -343,7 +343,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { for (const Expr& it_loop : loops) { PADDLE_ENFORCE_NOT_NULL( it_loop.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Loop in vector param(loops) of Fuse must be " @@ -354,7 +354,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { if (!for_nodes.empty()) { PADDLE_ENFORCE_NOT_NULL( for_nodes.back()->body.As(), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The body of for node is not Block!\n" @@ -364,7 +364,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { PADDLE_ENFORCE_EQ( for_nodes.back()->body.As()->stmts.size(), 1, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The Block's size of for node is not 1!\n" @@ -374,7 +374,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { PADDLE_ENFORCE_EQ( for_nodes.back()->body.As()->stmts[0], it_loop, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The For nodes in loops param of Fuse must be " @@ -437,7 +437,7 @@ Expr DyScheduleImpl::Fuse(const std::string& block_name, PADDLE_ENFORCE_EQ( loops_index[i - 1] + 1, loops_index[i], - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Loops index in Fuse should be continuous!\n" @@ -448,7 +448,7 @@ Expr DyScheduleImpl::Fuse(const std::string& block_name, for (int i : loops_index) { PADDLE_ENFORCE_LT(i, static_cast(all_loops.size()), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in 
the schedule " "primitive .\n" "[Error info] The loop index in Fuse should be less " @@ -459,7 +459,7 @@ Expr DyScheduleImpl::Fuse(const std::string& block_name, PADDLE_ENFORCE_GE( i, 0, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The loop index in Fuse should be >= 0!\n" @@ -485,7 +485,7 @@ Expr DyScheduleImpl::Fuse(const Expr& block, PADDLE_ENFORCE_EQ( loops_index[i - 1] + 1, loops_index[i], - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] Loops index in Fuse should be continuous!\n" @@ -496,7 +496,7 @@ Expr DyScheduleImpl::Fuse(const Expr& block, for (int i : loops_index) { PADDLE_ENFORCE_LT(i, static_cast(all_loops.size()), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The loop index in Fuse should be less " @@ -506,7 +506,7 @@ Expr DyScheduleImpl::Fuse(const Expr& block, PADDLE_ENFORCE_GT(i, 0, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The loop index in Fuse should be > 0!\n" @@ -554,7 +554,7 @@ Expr DyScheduleImpl::Reorder(const std::string& block_name, for (int i : loops_index) { PADDLE_ENFORCE_LT(i, static_cast(all_loops.size()), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The loop index in Reorder should be " @@ -565,7 +565,7 @@ Expr DyScheduleImpl::Reorder(const std::string& block_name, PADDLE_ENFORCE_GE( i, 0, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The loop index in Reorder should be >= 0!\n" @@ -590,7 +590,7 @@ Expr DyScheduleImpl::Reorder(const Expr& block, for (int i : loops_index) { PADDLE_ENFORCE_LT(i, static_cast(all_loops.size()), - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule " "primitive .\n" "[Error info] The loop index in Reorder should be " @@ -601,7 +601,7 @@ Expr DyScheduleImpl::Reorder(const Expr& block, PADDLE_ENFORCE_GE( i, 0, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An error occurred in the schedule primitive " ".\n" "[Error info] The loop index in Reorder should be >= 0!\n" diff --git a/paddle/cinn/ir/schedule/impl/reduction.cc b/paddle/cinn/ir/schedule/impl/reduction.cc index 26ae22a8fed71..e9df0c7520fa4 100644 --- a/paddle/cinn/ir/schedule/impl/reduction.cc +++ b/paddle/cinn/ir/schedule/impl/reduction.cc @@ -62,12 +62,12 @@ Expr DyScheduleImpl::FactorizeReduction(const Expr& rf_loop, PADDLE_ENFORCE_EQ( blocks.size(), 1, - phi::errors::InvalidArgument( + ::common::errors::InvalidArgument( "[IRScheduleError] An Error occurred in the schedule primite <%s>.\n" "[Error info] The rf_loop is required to have only one child block, " "but got %d!\n" - "[Error info] The Expr of current schedule is: %s.", - primitive, + "[Expr info] The Expr of current schedule is: %s.", + primitive.c_str(), blocks.size(), module_expr_.GetExprs())); Expr original_block = blocks.at(0); @@ -80,12 +80,12 @@ Expr DyScheduleImpl::FactorizeReduction(const Expr& rf_loop, PADDLE_ENFORCE_GT( original_loops.size(), 0, - 
diff --git a/paddle/cinn/ir/schedule/impl/storage.cc index 8f701ac90cf2e..055cf0fdeb452 100644 --- a/paddle/cinn/ir/schedule/impl/storage.cc +++ b/paddle/cinn/ir/schedule/impl/storage.cc @@ -43,12 +43,13 @@ Expr DyScheduleImpl::CacheRead(const Expr& block, std::string primitive = "CacheRead"; PADDLE_ENFORCE_NOT_NULL( - block.As(), phi::errors::InvalidArgument([&]() { + block.As(), + ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] Expr param(block) is not a ScheduleBlockRealize!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -58,12 +59,13 @@ Expr DyScheduleImpl::CacheRead(const Expr& block, Expr read_expr = GetNthAccessExpr(block, read_buffer_index, false); PADDLE_ENFORCE_NOT_NULL( - block.As(), phi::errors::InvalidArgument([&]() { + block.As(), + ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] The read_expr is not a Load!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -96,12 +98,13 @@ Expr DyScheduleImpl::CacheWrite(const Expr& block, std::string primitive = "CacheWrite"; PADDLE_ENFORCE_NOT_NULL( - block.As(), phi::errors::InvalidArgument([&]() { + block.As(), + ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] Expr param(block) is not a ScheduleBlockRealize!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -111,12 +114,12 @@ Expr DyScheduleImpl::CacheWrite(const Expr& block, Expr write_expr = GetNthAccessExpr(block, write_buffer_index, true); PADDLE_ENFORCE_NOT_NULL( - write_expr.As(), phi::errors::InvalidArgument([&]() { + write_expr.As(), ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] The write_expr is not a Store!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -152,13 +155,13 @@ Expr DyScheduleImpl::CacheWrite(const Expr& block, PADDLE_ENFORCE_EQ( info.write_tensor->buffer.defined(), true, - phi::errors::InvalidArgument([&]() { + ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] The buffer of current write_tensor is not " "defined!\n" - << "[Error info]
The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -177,12 +180,12 @@ Expr DyScheduleImpl::CacheWrite(const Expr& block, } } PADDLE_ENFORCE_EQ( - find_cache_block.size(), 1U, phi::errors::InvalidArgument([&]() { + find_cache_block.size(), 1U, ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] Size of find_cache_block is not 1!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -198,13 +201,13 @@ void DyScheduleImpl::SyncThreads(const Expr& ir_node, bool after_node) { PADDLE_ENFORCE_EQ( ir_node.As() || ir_node.As(), true, - phi::errors::InvalidArgument([&]() { + ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] Expr param(ir_node) should be a " "ScheduleBlockRealize or For!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -223,12 +226,13 @@ void DyScheduleImpl::SetBuffer(Expr& block, // NOLINT CINN_IR_SCHEDULE_BEGIN(); std::string primitive = "SetBuffer"; PADDLE_ENFORCE_NOT_NULL( - block.As(), phi::errors::InvalidArgument([&]() { + block.As(), + ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] Expr param(block) is not a ScheduleBlockRealize!\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); @@ -237,13 +241,13 @@ void DyScheduleImpl::SetBuffer(Expr& block, // NOLINT block, [&](const Expr* x) { return x->As(); }, true); PADDLE_ENFORCE_EQ( - find_tensor.size(), 1U, phi::errors::InvalidArgument([&]() { + find_tensor.size(), 1U, ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An error occurred in the schedule primitive <" << primitive << ">.\n" << "[Error info] One block should only have one Store node!(except " "for root block)\n" - << "[Error info] The Expr of current schedule is " + << "[Expr info] The Expr of current schedule is " << module_expr_.GetExprs() << "."; return os.str(); }())); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index b4337051a1c87..107862e280155 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -306,7 +306,7 @@ std::vector ValidateFactors(const std::vector& factors, for (auto& i : factors) { idx++; PADDLE_ENFORCE_EQ( - i != 0 && i >= -1, true, phi::errors::InvalidArgument([&]() { + i != 0 && i >= -1, true, ::common::errors::InvalidArgument([&]() { std::ostringstream os; os << "[IRScheduleError] An Error occurred in the schedule primitive " "<" @@ -318,19 +318,20 @@ std::vector ValidateFactors(const std::vector& factors, << module_expr.GetExprs() << "."; return os.str(); }())); - PADDLE_ENFORCE_EQ( - i == -1 && has_minus_one, false, phi::errors::InvalidArgument([&]() { - std::ostringstream os; - os << "[IRScheduleError] An Error occurred in the " - "schedule primitive <" - << primitive << ">.\n" 
- << "[Error info] The params in factors of Split " - "should not be less than -1 or " - << "have more than one -1!\n" - << "[Expr info] The Expr of current schedule is " - << module_expr.GetExprs() << "."; - return os.str(); - }())); + PADDLE_ENFORCE_EQ(i == -1 && has_minus_one, + false, + ::common::errors::InvalidArgument([&]() { + std::ostringstream os; + os << "[IRScheduleError] An Error occurred in the " + "schedule primitive <" + << primitive << ">.\n" + << "[Error info] The params in factors of Split " + "should not be less than -1 or " + << "have more than one -1!\n" + << "[Expr info] The Expr of current schedule is " + << module_expr.GetExprs() << "."; + return os.str(); + }())); if (i == -1) { has_minus_one = true; } else { @@ -340,7 +341,7 @@ std::vector ValidateFactors(const std::vector& factors, std::vector validated_factors = factors; if (!has_minus_one) { PADDLE_ENFORCE_GE( - product, total_extent, phi::errors::PreconditionNotMet([&]() { + product, total_extent, ::common::errors::PreconditionNotMet([&]() { std::ostringstream os; os << "[IRScheduleError] An Error occurred in the schedule primitive " "<" diff --git a/paddle/cinn/operator_fusion/fusion_tracker/expr_utils.cc b/paddle/cinn/operator_fusion/fusion_tracker/expr_utils.cc index dc1a9aa018002..baf15ad7b353f 100644 --- a/paddle/cinn/operator_fusion/fusion_tracker/expr_utils.cc +++ b/paddle/cinn/operator_fusion/fusion_tracker/expr_utils.cc @@ -128,7 +128,11 @@ std::vector TopoSort(const std::vector& op_exprs) { } } } - CHECK_EQ(result.size(), op_exprs.size()); + PADDLE_ENFORCE_EQ(result.size(), + op_exprs.size(), + ::common::errors::PreconditionNotMet( + "[Error info] the size of result should be equal to " + "the size of op_exprs.")); std::vector sorted_result; for (const auto& op : result) { sorted_result.push_back(*op); diff --git a/paddle/cinn/operator_fusion/utils.cc b/paddle/cinn/operator_fusion/utils.cc index 81377df831c52..c3967496f54bc 100644 --- a/paddle/cinn/operator_fusion/utils.cc +++ b/paddle/cinn/operator_fusion/utils.cc @@ -155,7 +155,11 @@ std::vector> GetNonBroadCastDims(pir::Operation* op) { const auto& [input_value, output_value] = broad_cast_value.value(); const int input_rank = GetRank(input_value); const int output_rank = GetRank(output_value); - CHECK_GE(output_rank, input_rank); + PADDLE_ENFORCE_GE(output_rank, + input_rank, + ::common::errors::PreconditionNotMet( + "[Error info] The ouput_rank should " + "be greater or equal to input_rank.")); // Compare axis one by one, from back to front. 
diff --git a/paddle/fluid/framework/barrier.h index 8a3d8c28e2b49..ba5fd50ddfa82 100644 --- a/paddle/fluid/framework/barrier.h +++ b/paddle/fluid/framework/barrier.h @@ -50,10 +50,17 @@ class Barrier { "Fail to initialize the barrier with error code %d.", ret)); #endif } - ~Barrier() { + ~Barrier() noexcept(false) { #ifdef _LINUX int ret = pthread_barrier_destroy(&_barrier); - CHECK_EQ(0, ret); + PADDLE_ENFORCE_EQ( + 0, + ret, + common::errors::PreconditionNotMet( + "[Error info] The result of " + "pthread_barrier_destroy(&_barrier) should be zero.\n " + "[result info] The value of current result is %d.", + ret)); #endif } void reset(int count) { @@ -125,10 +132,16 @@ class Semaphore { "Fail to initialize the semaphore with error code %d.", ret)); #endif } - ~Semaphore() { + ~Semaphore() noexcept(false) { #ifdef _LINUX int ret = sem_destroy(&_sem); - CHECK_EQ(0, ret); + PADDLE_ENFORCE_EQ( + 0, + ret, + common::errors::PreconditionNotMet( + "[Error info] The result of sem_destroy(&_sem) should be zero.\n" + "[result info] The value of current result is %d.", + ret)); #endif } void post() {
diff --git a/paddle/fluid/framework/new_executor/instruction/tensorrt_engine_instruction.cc index 0095788b6c1d4..e986bce6f2d84 100644 --- a/paddle/fluid/framework/new_executor/instruction/tensorrt_engine_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/tensorrt_engine_instruction.cc @@ -466,8 +466,8 @@ void TensorRTEngineInstruction::BindInputTensor( PADDLE_ENFORCE_GE( bind_index, 0, - phi::errors::InvalidArgument("Cannot find input name %s in TRT engine", - input_name.c_str())); + common::errors::InvalidArgument("Cannot find input name %s in TRT engine", + input_name.c_str())); #else const int bind_index = trt_engine_->engine()->getBindingIndex(input_name.c_str()) + @@ -640,8 +640,8 @@ void TensorRTEngineInstruction::BindOutputTensor( PADDLE_ENFORCE_GE( bind_index, 0, - phi::errors::InvalidArgument("Cannot find input name %s in TRT engine", - output_name.c_str())); + common::errors::InvalidArgument("Cannot find output name %s in TRT engine", + output_name.c_str())); #else const int bind_index = trt_engine_->engine()->getBindingIndex(output_name.c_str()) +
diff --git a/paddle/fluid/framework/tensor_util.cc index 48599fb4e18e7..20cd5baa22e76 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -873,7 +873,7 @@ phi::DenseTensor from_blob(void* data, const phi::Place& place, const Deleter& deleter) { PADDLE_ENFORCE_NOT_NULL( - data, phi::errors::InvalidArgument("data can not be nullptr.")); + data, common::errors::InvalidArgument("data cannot be nullptr.")); auto meta = phi::DenseTensorMeta(dtype, shape, strides); size_t size = SizeOf(dtype) * (meta.is_scalar ?
1 : product(meta.dims)); @@ -912,7 +912,7 @@ phi::DenseTensor TensorFromDLPack(DLManagedTensor* src, Deleter deleter) { } else if (src->dl_tensor.device.device_type == kDLCUDAHost) { place = phi::GPUPinnedPlace(); } else { - PADDLE_THROW(phi::errors::Unimplemented("Given Place is not supported")); + PADDLE_THROW(common::errors::Unimplemented("Given Place is not supported")); } ::DLDataType type = src->dl_tensor.dtype; diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 5a254dd84b2a0..748fa8a252ed9 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -711,7 +711,7 @@ bool BoxCoderOpInferSymbolicShape( PADDLE_ENFORCE_EQ(prior_box_shape.size(), 2, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The rank of Input PriorBox in BoxCoder operator " "must be 2. But received rank = %d", prior_box_shape.size())); @@ -725,7 +725,7 @@ bool BoxCoderOpInferSymbolicShape( PADDLE_ENFORCE_EQ(prior_box_var_shape.size(), 2, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The rank of Input(PriorBoxVar) in BoxCoder operator " "should be 2. But received rank = %d", prior_box_var_shape.size())); @@ -738,7 +738,7 @@ bool BoxCoderOpInferSymbolicShape( if (code_type == "encode_center_size") { PADDLE_ENFORCE_EQ(target_box_shape.size(), 2, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The rank of Input TargetBox in BoxCoder operator " "must be 2. But received rank is %d", target_box_shape.size())); @@ -751,13 +751,13 @@ bool BoxCoderOpInferSymbolicShape( } else if (code_type == "decode_center_size") { PADDLE_ENFORCE_EQ(target_box_shape.size(), 3, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The rank of Input TargetBox in BoxCoder operator " "must be 3. But received rank is %d", target_box_shape.size())); PADDLE_ENFORCE_EQ(axis == 0 || axis == 1, true, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "axis in BoxCoder operator must be 0 or 1. 
" "But received axis = %d", axis)); @@ -2286,7 +2286,7 @@ bool MaskedMultiheadAttention_OpInferSymbolicShape( PADDLE_ENFORCE_EQ( cache_kv_shape.size(), 5, - phi::errors::InvalidArgument("The cache_kv must be 5 dims.")); + common::errors::InvalidArgument("The cache_kv must be 5 dims.")); infer_context->AddEqualCstr(cache_kv_shape[0], symbol::DimExpr(2)); // TODO(Luohongzhige, Buaa): add constrain for the num_head and k_num_head @@ -2400,7 +2400,7 @@ bool NllLossOpInferSymbolicShape( const std::vector &label_shape = label_shape_or_data.shape(); PADDLE_ENFORCE_EQ(x_shape.size() == 2 || x_shape.size() == 4, true, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The tensor rank of Input(X) must be 2 or 4.")); infer_context->AddEqualCstr(x_shape[0], label_shape[0]); @@ -2408,10 +2408,10 @@ bool NllLossOpInferSymbolicShape( const symbol::ShapeOrDataDimExprs &w_shape_or_data = infer_context->GetShapeOrDataForValue(op->operand_source(2)); const std::vector &w_shape = w_shape_or_data.shape(); - PADDLE_ENFORCE_EQ( - w_shape.size(), - 1, - phi::errors::InvalidArgument("Input(Weight) should be a 1D tensor.")); + PADDLE_ENFORCE_EQ(w_shape.size(), + 1, + common::errors::InvalidArgument( + "Input(Weight) should be a 1D tensor.")); infer_context->AddEqualCstr(x_shape[1], w_shape[0]); } @@ -2433,7 +2433,7 @@ bool NllLossOpInferSymbolicShape( } else if (x_shape.size() == 4) { PADDLE_ENFORCE_EQ(label_shape.size(), 3, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Expected Input(Label) dimensions=3, received %d.", label_shape.size())); @@ -3638,7 +3638,7 @@ bool WeightedSampleNeighborsOpInferSymbolicShape( PADDLE_ENFORCE_EQ( input_shape.size(), 1, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The %s should be 1D, when it is not 2D, but we get %d", tensor_name, input_shape.size())); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index 2ee8756afacb2..33eae43d636c4 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -793,7 +793,7 @@ bool CropOpInferSymbolicShape(pir::Operation *op, PADDLE_ENFORCE_EQ(in_shape.size(), x_shape.size(), - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The number of elements (%d) of attribute 'shape' for " "CropTensor must be equal to the number of " "dimensions (%d) of the input.", @@ -802,7 +802,7 @@ bool CropOpInferSymbolicShape(pir::Operation *op, PADDLE_ENFORCE_EQ( offsets.size(), x_shape.size(), - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "The number of elements (%d) of attribute 'offsets' for " "CropTensor must be equal to the number of " "dimensions (%d) of the input.", @@ -2586,7 +2586,7 @@ bool PartialSumOpInferSymbolicShape( int inputs_num = xs_shapes.size(); PADDLE_ENFORCE_GT(inputs_num, 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: Input tensors count should > 0. 
But " "received inputs' length is 0.")); if (inputs_num == 1) { @@ -2598,10 +2598,10 @@ bool PartialSumOpInferSymbolicShape( for (int i = 0; i < inputs_num; i++) { const std::vector x_shape = xs_shapes[i].shape(); - PADDLE_ENFORCE_EQ( - x_shape.size(), - 2, - phi::errors::InvalidArgument("Only support two dimensions input now.")); + PADDLE_ENFORCE_EQ(x_shape.size(), + 2, + common::errors::InvalidArgument( + "Only support two dimensions input now.")); if (i > 0) { infer_context->AddEqualCstr(x_shape[0], batch_size); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c2bb06c04f655..d180413a62703 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1266,7 +1266,7 @@ PYBIND11_MODULE(libpaddle, m) { PADDLE_ENFORCE_NOT_NULL( dlMTensor, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "from_dlpack received an invalid capsule. " "Note that DLTensor capsules can be consumed only once, " "so you might have already constructed a tensor from it once.")); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index ea117dc79c5cd..37e54c33ddb3d 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -2804,7 +2804,7 @@ void LookupTableInferMeta(const MetaTensor& w, PADDLE_ENFORCE_EQ( table_dims.size(), 2, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: The dimensions of the 'lookup table' must be 2. " "But received lookup table's dimensions = %d, " "lookup table's shape = [%s].", @@ -2813,7 +2813,7 @@ void LookupTableInferMeta(const MetaTensor& w, PADDLE_ENFORCE_EQ( ids_dims[ids_rank - 1], 1, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: The last dimensions of the 'Ids' tensor must be 1. 
" "But received Ids's last dimensions = %d, Ids's shape = [%s].", ids_dims[ids_rank - 1], diff --git a/paddle/phi/infermeta/spmd_rules/pad.cc b/paddle/phi/infermeta/spmd_rules/pad.cc index 02528ef49c410..96dcc861bf22b 100644 --- a/paddle/phi/infermeta/spmd_rules/pad.cc +++ b/paddle/phi/infermeta/spmd_rules/pad.cc @@ -37,10 +37,10 @@ SpmdInfo PadInferSpmd(const DistMetaTensor& x, PADDLE_ENFORCE_EQ( x_ndim, x_dims_mapping.size(), - phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's " - "dims_mapping size [%d] are not matched.", - x_ndim, - x_dims_mapping.size())); + common::errors::InvalidArgument("The Tensor X's rank [%d] and X's " + "dims_mapping size [%d] are not matched.", + x_ndim, + x_dims_mapping.size())); std::vector dims_to_unshard; for (size_t i = 0; i < paddings.size(); i += 2) { if (paddings[i] != 0 || paddings[i + 1] != 0) { @@ -70,10 +70,10 @@ SpmdInfo PadGradInferSpmd(const DistMetaTensor& x, PADDLE_ENFORCE_EQ( out_ndim, out_dims_mapping.size(), - phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's " - "dims_mapping size [%d] are not matched.", - out_ndim, - out_dims_mapping.size())); + common::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's " + "dims_mapping size [%d] are not matched.", + out_ndim, + out_dims_mapping.size())); std::vector dims_to_unshard; for (size_t i = 0; i < paddings.size(); i += 2) { diff --git a/paddle/phi/kernels/cpu/lookup_table_grad_kernel.cc b/paddle/phi/kernels/cpu/lookup_table_grad_kernel.cc index af8de75df1764..c5e0dd4fe66e3 100644 --- a/paddle/phi/kernels/cpu/lookup_table_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lookup_table_grad_kernel.cc @@ -74,7 +74,7 @@ void LookupTableGradKernel(const Context &dev_ctx, PADDLE_ENFORCE_LT( ids_data[i], N, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -83,7 +83,7 @@ void LookupTableGradKernel(const Context &dev_ctx, PADDLE_ENFORCE_GE( ids_data[i], 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input" "value.", @@ -146,7 +146,7 @@ void LookupTableSparseGradKernel( common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " diff --git a/paddle/phi/kernels/cpu/lookup_table_kernel.cc b/paddle/phi/kernels/cpu/lookup_table_kernel.cc index 7aef664615c67..8dbb125208170 100644 --- a/paddle/phi/kernels/cpu/lookup_table_kernel.cc +++ b/paddle/phi/kernels/cpu/lookup_table_kernel.cc @@ -66,7 +66,7 @@ void LookupTableKernel(const Context &dev_ctx, PADDLE_ENFORCE_LT( ids[i], row_number, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -75,7 +75,7 @@ void LookupTableKernel(const Context &dev_ctx, PADDLE_ENFORCE_GE( ids[i], 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. 
Please check input " "value.", diff --git a/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu b/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu index 39fdbb1370a79..7b0f6aff3ffea 100644 --- a/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu @@ -167,7 +167,7 @@ void LookupTableSparseGradCUDAKernel( common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " diff --git a/paddle/phi/kernels/selected_rows/cpu/lookup_table_grad_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/lookup_table_grad_kernel.cc index ebc402551c234..b3f9919fcbe7c 100644 --- a/paddle/phi/kernels/selected_rows/cpu/lookup_table_grad_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/lookup_table_grad_kernel.cc @@ -70,7 +70,7 @@ void LookupTableGradKernel(const Context &dev_ctx, PADDLE_ENFORCE_LT( ids_data[i], N, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -79,7 +79,7 @@ void LookupTableGradKernel(const Context &dev_ctx, PADDLE_ENFORCE_GE( ids_data[i], 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0 and < %ld, but got %ld. Please check input" "value.", @@ -134,7 +134,7 @@ void LookupTableSparseGradKernel(const Context &dev_ctx, common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " diff --git a/paddle/phi/kernels/selected_rows/cpu/lookup_table_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/lookup_table_kernel.cc index 42ec9481d4e33..9045340474801 100644 --- a/paddle/phi/kernels/selected_rows/cpu/lookup_table_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/lookup_table_kernel.cc @@ -67,7 +67,7 @@ void LookupTableKernel(const Context &dev_ctx, PADDLE_ENFORCE_GE( ids[i], 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0. But received %ld", ids[i])); @@ -95,14 +95,14 @@ void LookupTableKernel(const Context &dev_ctx, PADDLE_ENFORCE_GE( ids[i], 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Variable value (input) of OP(fluid.layers.embedding) " "expected >= 0. But received %ld", ids[i])); PADDLE_ENFORCE_GE( id_index, 0, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "the input key should be exists. 
But received %d.", id_index)); if (input_data_type == phi::DataType::INT8 || diff --git a/paddle/phi/kernels/selected_rows/gpu/lookup_table_grad_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/lookup_table_grad_kernel.cu index c8dc2aaeb23b7..bb4c3f0551e99 100644 --- a/paddle/phi/kernels/selected_rows/gpu/lookup_table_grad_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/lookup_table_grad_kernel.cu @@ -167,7 +167,7 @@ void LookupTableSparseGradCUDAKernel( common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "ShapeError: The shape of lookup_table@Grad and " "output@Grad should be same. " "But received lookup_table@Grad's shape = [%s], " diff --git a/paddle/phi/kernels/xpu/top_p_sampling_kernel.cc b/paddle/phi/kernels/xpu/top_p_sampling_kernel.cc index 7b51b260cbd8d..1c153b1821874 100644 --- a/paddle/phi/kernels/xpu/top_p_sampling_kernel.cc +++ b/paddle/phi/kernels/xpu/top_p_sampling_kernel.cc @@ -63,7 +63,7 @@ void TopPSamplingKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( p_num, bs, - phi::errors::PreconditionNotMet( + common::errors::PreconditionNotMet( "Expected bs == p_num, but got bs=%d, p_num=%d.", bs, p_num)); std::vector infer_seed(bs, random_seed); diff --git a/test/cpp/inference/api/tester_helper.h b/test/cpp/inference/api/tester_helper.h index 63d521c50c836..0d83878057c26 100644 --- a/test/cpp/inference/api/tester_helper.h +++ b/test/cpp/inference/api/tester_helper.h @@ -130,9 +130,21 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { void CheckError(float data_ref, float data) { if (std::abs(data_ref) > 1) { - CHECK_LE(std::abs((data_ref - data) / data_ref), FLAGS_accuracy); + PADDLE_ENFORCE_LE( + std::abs((data_ref - data) / data_ref), + FLAGS_accuracy, + common::errors::InvalidArgument( + "[Error info] abs((data_ref - data) / data_ref) must be less than " + "or equal to FLAGS_accuracy.\n" + "[Argument info] Please check your input data_ref and data.")); } else { - CHECK_LE(std::abs(data_ref - data), FLAGS_accuracy); + PADDLE_ENFORCE_LE( + std::abs(data_ref - data), + FLAGS_accuracy, + common::errors::InvalidArgument( + "[Error info] abs(data_ref - data) must be less than or equal to " + "FLAGS_accuracy.\n" + "[Argument info] Please check your input data_ref and data.")); } } @@ -669,7 +681,12 @@ void SummarizeAccuracy(float avg_acc_ref, float avg_acc, int compared_idx) { } void SummarizePerformance(const char *title, float sample) { - CHECK_GT(sample, 0.0); + PADDLE_ENFORCE_GT(sample, + 0.0, + common::errors::InvalidArgument( + "[Error info] sample must be greater than 0.0\n" + "[Argument info] The current sample is %f.", + sample)); auto throughput = 1000.0 / sample; LOG(INFO) << title << ": avg fps: " << std::fixed << std::setw(6) << std::setprecision(4) << throughput << ", avg latency: " << sample @@ -757,14 +774,35 @@ void CompareAccuracy( SummarizeAccuracy(avg_acc_ref, avg_acc_quant, compared_idx); - if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0); + if (FLAGS_enable_fp32) { + PADDLE_ENFORCE_GT(avg_acc_ref, + 0.0, + common::errors::PreconditionNotMet( + "[Error info] avg_acc_ref must be greater than 0.0.\n" + "[Condition info] The current avg_acc_ref is %f.", + avg_acc_ref)); + } - if (FLAGS_enable_int8_ptq || FLAGS_enable_int8_qat || FLAGS_enable_bf16) - CHECK_GT(avg_acc_quant, 0.0); + if (FLAGS_enable_int8_ptq || FLAGS_enable_int8_qat || FLAGS_enable_bf16) { + PADDLE_ENFORCE_GT( + avg_acc_quant, + 
0.0, + common::errors::PreconditionNotMet( + "[Error info] avg_acc_quant must be greater than 0.0.\n" + "[Condition info] The current avg_acc_quant is %f.", + avg_acc_quant)); + } if (FLAGS_enable_fp32 && - (FLAGS_enable_int8_ptq || FLAGS_enable_int8_qat || FLAGS_enable_bf16)) - CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy); + (FLAGS_enable_int8_ptq || FLAGS_enable_int8_qat || FLAGS_enable_bf16)) { + PADDLE_ENFORCE_LE( + avg_acc_ref - avg_acc_quant, + FLAGS_quantized_accuracy, + common::errors::PreconditionNotMet( + "[Error info] avg_acc_ref - avg_acc_quant must be less than or " + "equal to FLAGS_quantized_accuracy.\n" + "[Condition info] Please check your input data.")); + } // test } void CompareDeterministic( From b8aa4a72c18a5e48320ac440e97a59ad2bcde5ad Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Tue, 8 Oct 2024 20:40:02 +0800 Subject: [PATCH 031/135] fix the performance descent caused by pr68140 (#68488) --- python/paddle/base/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 44869136258b6..fec12eb1eb97a 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -629,7 +629,7 @@ def _to_str(var): elif isinstance(var, Operator): return str(id(var)) elif isinstance(var, Value): - return str(id(var)) + return str(var.id) else: raise TypeError(str(var) + " should be Variable, Operator or str") From 078bc033d3d02a78322379eccbb9bacc29f210cf Mon Sep 17 00:00:00 2001 From: Xinyi Li Date: Wed, 9 Oct 2024 09:25:36 +0800 Subject: [PATCH 032/135] delete extra placement (#68494) --- paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc index 7a7ea92c727de..0dcc8aa77f60b 100644 --- a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc @@ -189,7 +189,6 @@ class OneDNNPlacementPass : public pir::PatternRewritePass { patternCreator.CreatePatterns(ps); patternCreator.CreatePatterns(ps); patternCreator.CreatePatterns(ps); - patternCreator.CreatePatterns(ps); patternCreator.CreatePatterns(ps); patternCreator.CreatePatterns(ps); patternCreator.CreatePatterns(ps); From c104a7c9847604a6140fb823e4415d7518d30f07 Mon Sep 17 00:00:00 2001 From: Shuhao Liang <50269654+lshpku@users.noreply.github.com> Date: Wed, 9 Oct 2024 10:20:06 +0800 Subject: [PATCH 033/135] [CINN] Add method to check applicability of GridReduce (#68338) --- .../hlir/framework/pir/trivial_op_impl.cc | 5 + .../cinn/hlir/framework/pir/trivial_op_impl.h | 12 +- .../ir/group_schedule/config/CMakeLists.txt | 1 + .../group_schedule/config/group_tile_util.cc | 112 ++++++++++++++++++ .../group_schedule/config/group_tile_util.h | 28 +++++ 5 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 paddle/cinn/ir/group_schedule/config/group_tile_util.cc create mode 100644 paddle/cinn/ir/group_schedule/config/group_tile_util.h diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc b/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc index 5ce62346d55d9..b42fc1ade3b3e 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc +++ b/paddle/cinn/hlir/framework/pir/trivial_op_impl.cc @@ -25,6 +25,7 @@ #include "paddle/cinn/hlir/pe/map_expr_to_ir.h" #include "paddle/cinn/ir/dim.h" #include 
"paddle/cinn/ir/group_schedule/base_group_scheduler.h" +#include "paddle/cinn/ir/group_schedule/config/group_tile_util.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" #include "paddle/cinn/lang/placeholder.h" @@ -806,6 +807,10 @@ std::shared_ptr GetFusionGroupInfo( } }); } + + group_info->can_apply_grid_reduce = + GetCanApplyGridReduce(op_compute_bodies, group_info->reduce_axis); + VLOG(4) << group_info->DebugPrint(); return group_info; } diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_impl.h b/paddle/cinn/hlir/framework/pir/trivial_op_impl.h index 2f6429dc99552..d8063ef401a47 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_impl.h +++ b/paddle/cinn/hlir/framework/pir/trivial_op_impl.h @@ -168,12 +168,16 @@ struct FusionGroupInfo { std::vector loop_strides; std::vector reduce_axis; std::vector reduce_var_name; + bool can_apply_grid_reduce; std::string DebugPrint() { - return "GroupInfo\nloop_ranges: " + cinn::utils::Join(loop_ranges, " ") + - "\nloop_strides: " + cinn::utils::Join(loop_strides, ", ") + - "\nreduce_axis: " + cinn::utils::Join(reduce_axis, " ") + - "\nreduce_var_name: " + cinn::utils::Join(reduce_var_name, " "); + std::stringstream ss; + ss << "GroupInfo\nloop_ranges: " << cinn::utils::Join(loop_ranges, " ") + << "\nloop_strides: " << cinn::utils::Join(loop_strides, ", ") + << "\nreduce_axis: " << cinn::utils::Join(reduce_axis, " ") + << "\nreduce_var_name: " << cinn::utils::Join(reduce_var_name, " ") + << "\ncan_apply_grid_reduce: " << can_apply_grid_reduce; + return ss.str(); } }; diff --git a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt index f0cbf74d8df56..cfef0508780ff 100644 --- a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt +++ b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt @@ -3,6 +3,7 @@ cinn_proto_library(tile_config_proto SRCS tile_config_desc.proto) core_gather_headers() gather_srcs(cinnapi_src SRCS group_tile_config.cc) +gather_srcs(cinnapi_src SRCS group_tile_util.cc) gather_srcs(cinnapi_src SRCS database.cc) gather_srcs(cinnapi_src SRCS file_database.cc) gather_srcs(cinnapi_src SRCS schedule_config_manager.cc) diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_util.cc b/paddle/cinn/ir/group_schedule/config/group_tile_util.cc new file mode 100644 index 0000000000000..91408798582d1 --- /dev/null +++ b/paddle/cinn/ir/group_schedule/config/group_tile_util.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/cinn/ir/group_schedule/config/group_tile_util.h" +#include "paddle/cinn/hlir/framework/pir/trivial_op_impl.h" +#include "paddle/cinn/ir/ir_analyzer/ir_analyzer.h" + +namespace cinn { + +using hlir::framework::pir::trivial_fusion_detail::GetAllForIters; +using hlir::framework::pir::trivial_fusion_detail::ExprSetFinderUtils:: + ChildScheduleBlockRealizes; +using hlir::framework::pir::trivial_fusion_detail::ExprSetFinderUtils:: + ChildTensorLoads; +using hlir::framework::pir::trivial_fusion_detail::ExprSetFinderUtils:: + ScheduleBlockRealizeIsNotInit; + +namespace ir { + +bool GetCanApplyGridReduce(const std::vector& op_compute_bodies, + const std::vector& reduce_axis) { + // Names of tensors that are downstream of reduce. + // A tensor is downstream of reduce either if it is produced by a reduce, or + // if it has data dependency on another tensor that is downstream of reduce. + std::unordered_set reduce_downstream_tensor_names; + int reduce_count = 0; + + const auto IsReduceDownstream = [&](const ir::Expr& expr_block) { + for (auto& expr_load : ChildTensorLoads(expr_block)) { + std::string load_tensor_name = expr_load.As()->name(); + if (reduce_downstream_tensor_names.count(load_tensor_name) > 0) { + return true; + } + } + return false; + }; + + const auto AddReduceDownstream = [&](const ir::Expr& expr_block) { + auto expr_store = analyzer::GetStoreOfSBlock(expr_block); + std::string store_tensor_name = expr_store.As()->name(); + reduce_downstream_tensor_names.insert(store_tensor_name); + }; + + const auto CheckOutputHasReduceAxis = [&](const ir::Expr& body, + const ir::Expr& expr_block) { + std::vector all_loop_vars = GetAllForIters(body); + std::unordered_set reduce_loop_vars; + for (int64_t axis : reduce_axis) { + reduce_loop_vars.insert(all_loop_vars[axis]->name); + } + + std::unordered_set reduce_iter_vars; + auto* block = expr_block.As(); + auto& iter_vars = block->schedule_block.As()->iter_vars; + for (int i = 0; i < iter_vars.size(); i++) { + ir::Var loop_var = block->iter_values[i].as_var_ref(); + if (reduce_loop_vars.count(loop_var->name) > 0) { + reduce_iter_vars.insert(iter_vars[i]->name); + } + } + + // The result is true if the indices of the output tensor contain any + // reduce iter vars. + auto expr_store = analyzer::GetStoreOfSBlock(expr_block); + for (auto& index_expr : expr_store.As()->indices) { + if (reduce_iter_vars.count(index_expr.as_var_ref()->name) > 0) { + return true; + } + } + return false; + }; + + for (const auto& body : op_compute_bodies) { + ir::Expr expr_block = + (ChildScheduleBlockRealizes * ScheduleBlockRealizeIsNotInit) + .GetSingle(body); + bool is_reduce = analyzer::IsReductionSBlock(expr_block); + bool is_reduce_downstream = IsReduceDownstream(expr_block); + bool output_has_reduce_axis = CheckOutputHasReduceAxis(body, expr_block); + + if (is_reduce) { + ++reduce_count; + } + if (is_reduce_downstream || is_reduce) { + AddReduceDownstream(expr_block); + } + + // When a block is downstream of reduce, its output shouldn't contain + // reduce axis. Otherwise, it broadcasts the result of reduce. If this + // is the case, we cannot apply grid reduce. 
+ if (is_reduce_downstream && output_has_reduce_axis) { + VLOG(4) << "grid reduce is prohibited by block: " << expr_block; + return false; + } + } + + return reduce_count == 1; +} + +} // namespace ir +} // namespace cinn diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_util.h b/paddle/cinn/ir/group_schedule/config/group_tile_util.h new file mode 100644 index 0000000000000..2fe37e22f0ca3 --- /dev/null +++ b/paddle/cinn/ir/group_schedule/config/group_tile_util.h @@ -0,0 +1,28 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/cinn/ir/ir.h" + +namespace cinn { +namespace ir { + +// Check whether we can apply grid reduce in this group. +// We can apply grid reduce if there is exactly one reduce, and whose result is +// not broadcasted before output. +bool GetCanApplyGridReduce(const std::vector& op_compute_bodies, + const std::vector& reduce_axis); + +} // namespace ir +} // namespace cinn From 042eb6d23b94f35b51f9addacd0a898cf663ae92 Mon Sep 17 00:00:00 2001 From: zhink <33270771+zhink@users.noreply.github.com> Date: Wed, 9 Oct 2024 10:30:38 +0800 Subject: [PATCH 034/135] add pd_op.cast converter for PIR TRT (#68216) * add cast op trt layer * codestyle * codestyle * with add_identity * code style --- python/paddle/tensorrt/impls/manipulation.py | 28 +++++++++++++++ test/tensorrt/test_converter_manipulation.py | 36 ++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 556bd48e8549c..6c935e93a9020 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -214,6 +214,34 @@ def squeeze_converter(network, paddle_op, inputs): return layer.get_output(0) +@converter_registry.register("pd_op.cast", trt_version="8.x") +@converter_registry.register("pd_op.cast_", trt_version="8.x") +def cast_converter(network, paddle_op, inputs): + input_tensor = inputs[0] + out_dtype = int(paddle_op.attrs().get("dtype")) + # Reference paddle/phi/common/data_type.h enum DataType + if out_dtype == 1: + out_dtype = trt.bool + elif out_dtype == 7: + out_dtype = trt.int32 + elif out_dtype == 9: + out_dtype = trt.int32 + elif out_dtype == 10: + out_dtype = trt.float32 + elif out_dtype == 11: + out_dtype = trt.float32 + elif out_dtype == 15: + out_dtype = trt.float16 + else: + raise RuntimeError( + f"cast converter currently doesn't support dtype: {out_dtype}" + ) + cast_layer = network.add_identity(input_tensor) + cast_layer.set_output_type(0, out_dtype) + cast_layer.get_output(0).dtype = out_dtype + return cast_layer.get_output(0) + + @converter_registry.register("pd_op.slice", trt_version="8.x") def slice_converter(network, paddle_op, inputs): input_tensor = inputs[0] diff --git a/test/tensorrt/test_converter_manipulation.py b/test/tensorrt/test_converter_manipulation.py index 638e9de6b2802..15af4f2e30996 100644 --- 
a/test/tensorrt/test_converter_manipulation.py +++ b/test/tensorrt/test_converter_manipulation.py @@ -21,6 +21,42 @@ from paddle import _C_ops +class TestCast0TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cast + self.api_args = { + "x": np.random.randn(7, 3).astype(np.float32), + "out_dtype": np.bool_, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.max_shape = {"x": [10, 3]} + + +class TestCast1TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cast + self.api_args = { + "x": np.random.randn(7, 3).astype(np.float16), + "out_dtype": np.int32, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.max_shape = {"x": [10, 3]} + + +class TestCast2TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cast + self.api_args = { + "x": np.random.randn(7, 3).astype(np.float32), + "out_dtype": np.int64, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.max_shape = {"x": [10, 3]} + + class TestConcatTRTPattern(TensorRTBaseTest): def setUp(self): self.python_api = paddle.concat From d72a8527c61b3027f13f4a1e471d0009284d7c80 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 9 Oct 2024 10:30:50 +0800 Subject: [PATCH 035/135] =?UTF-8?q?=E3=80=90pir=E3=80=91modify=20Decomp=5F?= =?UTF-8?q?gen=20for=20mutable=5Fattr=20(#68539)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify paddletest * modify paddletest * remove dynamic shape * modify ci * modify mutable attribute gen for decomp --- .../templates/decomp/generated_decomp.j2 | 3 ++- paddle/phi/ops/yaml/op_compat.yaml | 20 +++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 index a441f6b1432ad..e45280c683705 100644 --- a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 +++ b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp.j2 @@ -15,6 +15,7 @@ namespace paddle { namespace dialect { using IntArray = paddle::experimental::IntArray; {% macro sig(fwd_name, class_name, inputs, attrs, outputs) %} + {% set input_names=[] %} {% set attr_names=[] %} {% set output_names=[] %} @@ -72,7 +73,7 @@ std::vector> {{class_name}}::Decomp(pir::Operation* op) {% if attrs %} {% for item in attrs %} {% do attr_names.append(item.name) %} - {% if item.typename == "Scalar" and item.support_tensor %} + {% if item.typename.startswith("Scalar") and item.support_tensor %} Tensor {{item.name}}_(std::make_shared(op_obj.{{item.name}}())); diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 4b0c6f73c5f3e..fbf5d8f15f2a9 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -3618,22 +3618,22 @@ outputs: out : Out int_array: - sections : - data_type : int - support_tensor : true + sections : + data_type : int + support_tensor : true scalar : - axis : - data_type : int - support_tensor : true + axis : + data_type : int + support_tensor : true extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : split_with_num scalar : - axis : - data_type : int - support_tensor : true - tensor_name : AxisTensor + axis : + data_type : int + support_tensor : true + tensor_name : AxisTensor - op : sqrt backward 
: sqrt_grad, sqrt_double_grad (sqrt_grad_grad) From 47b04cb9294039ac8b09ab4efd35cbad7a37938e Mon Sep 17 00:00:00 2001 From: Botao Zhou <1095497213@qq.com> Date: Wed, 9 Oct 2024 10:44:43 +0800 Subject: [PATCH 036/135] refine op id message when catch exception in pir (#68492) * refine op id message when catch exception in pir * fix * fix * fix * fix * fix * fix * rerun * fix --- .../framework/new_executor/pir_interpreter.cc | 17 ++++++-- .../pir/transforms/pd_op_to_kernel_pass.cc | 23 ++++++++++ .../standalone_executor_pir_test.cc | 42 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index b038d89bde198..2d1aadcd5fe75 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1949,9 +1949,20 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { const std::vector op_callstack_attr = interpreter::GetInstructionCallStack(op->name(), op->attributes()); framework::InsertCallStackInfo(op->name(), op_callstack_attr, &ex); - LOG(WARNING) << " OP id:" << instr_node->Id() << " " << instr_node->Name() - << " raises an EnforceNotMet exception " - << common::demangle(typeid(ex).name()); + if (op->HasAttribute("origin_id")) { + LOG(WARNING) + << "Instruction OP id: " << instr_node->Id() << ", Ir OP id: " + << op->attribute("origin_id").dyn_cast().data() + << ", " << instr_node->Name() << " raises an EnforceNotMet exception " + << common::demangle(typeid(ex).name()); + } else { + LOG(WARNING) << "Instruction OP id: " << instr_node->Id() + << ", Ir OP id is null" + << ", " << instr_node->Name() + << " raises an EnforceNotMet exception " + << common::demangle(typeid(ex).name()); + } + exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); } catch (platform::EOFException&) { exception_holder_.Catch(std::current_exception()); diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 8047630858f54..f9ffa5c4b9f0a 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -528,6 +528,8 @@ static pir::Value AddPlaceTransferOp(pir::Value in, pir::OpInfo kernel_op_info = ctx->GetRegisteredOpInfo(PhiKernelOp::name()); pir::Operation* op = pir::Operation::Create({in}, op_attribute, {out_type}, kernel_op_info); + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); + auto in_op = in.defining_op(); if (in_op && in_op->HasAttribute(kAttrIsPersistable)) { op->set_attribute(kAttrIsPersistable, in_op->attribute(kAttrIsPersistable)); @@ -567,6 +569,7 @@ static pir::Value AddOneDNN2PaddleLayoutTransferOp( pir::OpInfo kernel_op_info = ctx->GetRegisteredOpInfo(PhiKernelOp::name()); pir::Operation* op = pir::Operation::Create({in}, op_attribute, {out_type}, kernel_op_info); + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); auto in_op = in.defining_op(); if (in_op && in_op->HasAttribute(kAttrIsPersistable)) { @@ -819,6 +822,7 @@ pir::Value AddDtypeTransferOp(pir::Value in, pir::Operation* op = pir::Operation::Create( {in}, op_attribute, {output_types}, kernel_op_info); + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); auto in_op = in.defining_op(); if (in_op && in_op->HasAttribute(kAttrIsPersistable)) { @@ -1679,6 +1683,9 @@ void AddShadowFeedForValue( attr_map, {out_type}, 
phi_kernel_op_info); + shadow_op->set_attribute("origin_id", + pir::Int64Attribute::get(ctx, shadow_op->id())); + block->push_back(shadow_op); (*map_op_pair)[op_item] = shadow_op; (*map_value_pair)[op_item->result(index)] = shadow_op->result(0); @@ -1719,6 +1726,8 @@ void AddShadowFeedForValue( attr_map, {out_type}, phi_kernel_op_info); + shadow_tensors_op->set_attribute( + "origin_id", pir::Int64Attribute::get(ctx, shadow_tensors_op->id())); block->push_back(shadow_tensors_op); (*map_op_pair)[op_item] = shadow_tensors_op; (*map_value_pair)[op_item->result(index)] = shadow_tensors_op->result(0); @@ -2061,6 +2070,7 @@ void HandleForSpecialOp( pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); pir::Operation* op = pir::Operation::Create( vec_inputs, op_item->attributes(), op_output_types, op_info); + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); block->push_back(op); (*map_op_pair)[op_item] = op; @@ -2350,6 +2360,7 @@ void HandleForCustomOp( pir::Operation* op = nullptr; op = pir::Operation::Create( vec_inputs, op_attribute, op_output_types, custom_kernel_op_info); + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); (*map_op_pair)[op_item] = op; @@ -2412,6 +2423,7 @@ void HandleForTensorRTOp( pir::Operation* op = nullptr; op = pir::Operation::Create( vec_inputs, op_attribute, op_output_types, trt_op_info); + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); (*map_op_pair)[op_item] = op; @@ -2608,6 +2620,8 @@ std::vector BuildInputs( pir::Type target_vec_type = pir::VectorType::get(ctx, types_in_vec); pir::Operation* operation = pir::Operation::Create( new_vec_inputs, {}, {target_vec_type}, op_info); + operation->set_attribute( + "origin_id", pir::Int64Attribute::get(ctx, operation->id())); new_in.defining_op()->ReplaceAllUsesWith(operation->results()); block->erase(*new_in.defining_op()); @@ -2779,6 +2793,8 @@ std::vector BuildInputs( pir::Type target_vec_type = pir::VectorType::get(ctx, types_in_vec); pir::Operation* operation = pir::Operation::Create( inner_inputs, {}, {target_vec_type}, op_info); + operation->set_attribute( + "origin_id", pir::Int64Attribute::get(ctx, operation->id())); new_in = operation->result(0); block->push_back(operation); @@ -3149,6 +3165,7 @@ pir::Operation* BuildKernelOp( vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); } } + op->set_attribute("origin_id", pir::Int64Attribute::get(ctx, op->id())); (*map_op_pair)[op_item] = op; // only deal with single output if (op_item->num_results() > 0) { @@ -3184,6 +3201,8 @@ pir::Operation* OneDNNOp2PdOp(pir::Operation* op_item, op_item->attributes(), op_item_inner_output_types, op_info); + op_item_inner->set_attribute( + "origin_id", pir::Int64Attribute::get(ctx, op_item_inner->id())); op_item->ReplaceAllUsesWith(op_item_inner->results()); for (auto iter = block->begin(); iter != block->end(); ++iter) { // NOLINT if (*iter == *op_item) { @@ -3220,6 +3239,8 @@ pir::Operation* PdOp2OneDNNOp(pir::Operation* op_item, attributes, op_item_inner_output_types, op_info); + op_item_inner->set_attribute( + "origin_id", pir::Int64Attribute::get(ctx, op_item_inner->id())); op_item->ReplaceAllUsesWith(op_item_inner->results()); for (auto iter = block->begin(); iter != block->end(); ++iter) { // NOLINT if (*iter == *op_item) { @@ -3273,6 +3294,8 @@ void ProcessBlock( ctx->GetRegisteredOpInfo(PhiKernelOp::name()); pir::Operation* shadow_op = pir::Operation::Create( {(*map_value_pair)[arg]}, attr_map, {out_type}, phi_kernel_op_info); + 
shadow_op->set_attribute( + "origin_id", pir::Int64Attribute::get(ctx, shadow_op->id())); new_block->push_back(shadow_op); (*map_value_pair)[arg] = shadow_op->result(0); diff --git a/test/cpp/new_executor/standalone_executor_pir_test.cc b/test/cpp/new_executor/standalone_executor_pir_test.cc index b6feb4e136bbe..dade2c75c37ea 100644 --- a/test/cpp/new_executor/standalone_executor_pir_test.cc +++ b/test/cpp/new_executor/standalone_executor_pir_test.cc @@ -98,6 +98,48 @@ TEST(StandaloneExecutor, run) { EXPECT_EQ(res3, true); } +TEST(StandaloneExecutor, run_error) { + pir::IrContext* ctx = pir::IrContext::Instance(); + pir::Program program((ctx)); + + ctx->GetOrRegisterDialect(); + + pir::Builder builder = pir::Builder(ctx, program.block()); + + paddle::dialect::FullOp op1 = builder.Build( + std::vector{2, 2}, 1.0, phi::DataType::FLOAT32, phi::CPUPlace()); + + paddle::dialect::FullOp op2 = builder.Build( + std::vector{2, 2}, 1.0, phi::DataType::FLOAT64, phi::CPUPlace()); + + auto add_op = + builder.Build(op1->result(0), op2->result(0)); + + std::string out_name = "add_out"; + builder.Build(add_op->result(0), out_name); + + auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program); + + for (auto op : kernel_program->block()->ops()) { + op->erase_attribute("origin_id"); + } + + auto place = phi::CPUPlace(); + Scope scope; + + InterpreterCore test_core(place, {}, kernel_program->block(), &scope); + + test_core.SetSkipGcVars({out_name}); + + try { + test_core.Run({}); + } catch (std::exception& e) { + bool is_catch = + std::string(e.what()).find("InvalidArgumentError") != std::string::npos; + EXPECT_EQ(is_catch, true); + } +} + TEST(StandaloneExecutor, run_feed_tensor) { pir::IrContext* ctx = pir::IrContext::Instance(); pir::Program program(ctx); From 2124f6e2cbf73df076a3c531a4de254c8df81a1c Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 9 Oct 2024 11:36:04 +0800 Subject: [PATCH 037/135] Modify include onednn_op_list.h [fluid_ops] (#68563) --- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/imperative/prepared_operator.cc | 2 +- paddle/{fluid => phi/core}/platform/onednn_op_list.h | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename paddle/{fluid => phi/core}/platform/onednn_op_list.h (100%) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7746bf804b97a..55152e17048ed 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -54,7 +54,7 @@ class DenseTensor; #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/platform/onednn_helper.h" -#include "paddle/fluid/platform/onednn_op_list.h" +#include "paddle/phi/core/platform/onednn_op_list.h" #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 5923a34274dbe..6829115d7cffb 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/platform/onednn_op_list.h" +#include "paddle/phi/core/platform/onednn_op_list.h" #endif #include "paddle/common/flags.h" #include "paddle/fluid/framework/library_type.h" diff --git a/paddle/fluid/platform/onednn_op_list.h b/paddle/phi/core/platform/onednn_op_list.h similarity index 100% rename from paddle/fluid/platform/onednn_op_list.h rename to paddle/phi/core/platform/onednn_op_list.h From 
9b8e47b2e7afd68b76de82297cb414e2a77c4b0b Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 9 Oct 2024 11:39:41 +0800 Subject: [PATCH 038/135] Modify include collective_helper.h [fluid_ops] (#68562) * Fix * Fix --- paddle/fluid/framework/data_feed.cu | 2 +- paddle/fluid/framework/ir/graph_helper.cc | 2 +- .../control_flow/if_instruction.cc | 2 +- .../control_flow/pylayer_instruction.cc | 2 +- .../control_flow/while_instruction.cc | 2 +- .../instruction/instruction_base.cc | 2 +- .../instruction/instruction_util.cc | 2 +- .../instruction/onednn/onednn_instruction.cc | 2 +- .../onednn/onednn_mixed_instruction.cc | 2 +- .../instruction/phi_kernel_instruction.cc | 2 +- .../interpreter/stream_analyzer.cc | 2 +- paddle/fluid/imperative/bkcl_context.cc | 2 +- paddle/fluid/imperative/heter_ccl_context.cc | 2 +- paddle/fluid/imperative/nccl_context.cc | 2 +- paddle/fluid/imperative/xccl_context.cc | 2 +- .../tensorrt/plugin/c_allreduce_op_plugin.cu | 2 +- paddle/fluid/platform/collective_helper.cc | 2 +- paddle/fluid/platform/collective_helper.h | 21 ------------------- .../fluid/platform/device/gpu/nccl_helper.h | 2 +- .../fluid/platform/device/xpu/bkcl_helper.h | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- 21 files changed, 20 insertions(+), 41 deletions(-) delete mode 100644 paddle/fluid/platform/collective_helper.h diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 067b89a5acb5f..91b91b4f7a2fa 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -32,7 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/kernels/gpu/graph_reindex_funcs.h" #include "paddle/phi/kernels/graph_reindex_kernel.h" diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d773651266f39..54b7889dc5c7f 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -25,8 +25,8 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/platform/collective_helper.h" COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/common/flags.h" diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index 7e35e302b7c35..d3417759afe0a 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -23,9 +23,9 @@ #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/type_defs.h" diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc index be5d26f66c062..f8cecafa31241 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/pylayer_instruction.cc @@ -22,9 +22,9 @@ #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/type_defs.h" diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc index 994ebd60bf285..3b47335760e18 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc @@ -23,9 +23,9 @@ #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/type_defs.h" diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 4d78b715b8875..a22bed3cf14f4 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include 
"paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/pir/include/core/builtin_attribute.h" namespace paddle::framework { diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index 1667198e2a5f7..6d1a974e36c31 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -37,9 +37,9 @@ #include "paddle/pir/include/core/block_argument.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/common/flags.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/platform/collective_helper.h" COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc index c80c75afa7736..09f4b7c2a7b58 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc @@ -20,9 +20,9 @@ #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/type_defs.h" diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc index 54c08d7d63c77..6030367e7bd4a 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc @@ -22,9 +22,9 @@ #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/type_defs.h" diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index affd52eeb22be..f94a8db4e7050 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -22,10 +22,10 @@ #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/profiler/event_tracing.h" 
#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/type_defs.h" #include "paddle/pir/include/core/builtin_attribute.h" diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 236896ffaded0..9bd2c406bb157 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -22,9 +22,9 @@ #include "paddle/phi/core/platform/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/common/flags.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/platform/collective_helper.h" COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index c65e2e10c92a6..1c3fe261d7aa1 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -22,9 +22,9 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" #include "paddle/utils/string/split.h" diff --git a/paddle/fluid/imperative/heter_ccl_context.cc b/paddle/fluid/imperative/heter_ccl_context.cc index de5e16dbf90ba..c0b37027654ed 100644 --- a/paddle/fluid/imperative/heter_ccl_context.cc +++ b/paddle/fluid/imperative/heter_ccl_context.cc @@ -20,8 +20,8 @@ #endif #include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" #include "paddle/utils/string/split.h" diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 253aa2e97b8a9..58b946797cdff 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -16,7 +16,7 @@ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" #endif diff --git a/paddle/fluid/imperative/xccl_context.cc b/paddle/fluid/imperative/xccl_context.cc index 6332555ec4be0..ad4925c714d9f 100644 --- a/paddle/fluid/imperative/xccl_context.cc +++ b/paddle/fluid/imperative/xccl_context.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/imperative/xccl_context.h" #if defined(PADDLE_WITH_CUSTOM_DEVICE) -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu 
b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu index 1b163b74610bb..0cab9341b0949 100644 --- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu @@ -16,9 +16,9 @@ #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" +#include "paddle/phi/core/platform/collective_helper.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/common/flags.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index e0f9731cf8531..956204114554d 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/platform/collective_helper.h" namespace paddle { namespace platform {} // namespace platform diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h deleted file mode 100644 index 7d0d474d0e4c0..0000000000000 --- a/paddle/fluid/platform/collective_helper.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/core/platform/collective_helper.h" - -namespace paddle { -namespace platform {} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index f333db5d35d79..93e1278861bcb 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -25,7 +25,7 @@ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/platform/collective_helper.h" #ifdef PADDLE_WITH_NCCL #include "paddle/phi/backends/dynload/nccl.h" #endif diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index 071ce16077361..7499a0059f842 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -29,10 +29,10 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device_context.h" #include "xpu/bkcl.h" #include "xpu/runtime.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d180413a62703..6e98f047be2a3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -172,9 +172,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/fluid/operators/custom_device_common_op_registry.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" #include "paddle/phi/capi/capi.h" +#include "paddle/phi/core/platform/collective_helper.h" #include "paddle/phi/core/platform/device/custom/custom_device_resource_pool.h" #endif From 0446e608256188f284b1a1aede2ddb7b45534564 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 9 Oct 2024 11:44:42 +0800 Subject: [PATCH 039/135] Modify include device_wrapper.h [fluid_ops] (#68564) * Fix * Fix * Fix --- paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/garbage_collector.cc | 2 +- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/imperative/tracer.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 2 +- paddle/fluid/platform/device/device_wrapper.h | 39 ------------------- paddle/fluid/platform/init.cc | 2 +- .../fluid/platform/stream_callback_manager.cc | 2 +- paddle/fluid/pybind/compiled_program.cc | 2 +- paddle/fluid/pybind/place.cc | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 6 +-- .../cpp/fluid/memory/system_allocator_test.cc | 2 +- 14 files changed, 15 insertions(+), 54 deletions(-) delete mode 100644 paddle/fluid/platform/device/device_wrapper.h diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 64073aa5e72ca..9fba57e10fd0b 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/transform.h" #if defined(PADDLE_WITH_XPU) -#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 783f65dcae9a0..fd65efd946356 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -18,7 +18,7 @@ #endif #include "paddle/common/flags.h" #include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" COMMON_DECLARE_double(eager_delete_tensor_gb); COMMON_DECLARE_double(memory_fraction_of_eager_deletion); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 55152e17048ed..adc6dfcf20afc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -32,7 +32,6 @@ limitations under the License. */ #include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/fluid/operators/ops_signature/signatures.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" @@ -41,6 +40,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/get_kerneltype_forvar_utils.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/profiler.h" namespace phi { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 1747c98fafd8b..398d09590d36b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -26,12 +26,12 @@ #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/operators/ops_extra_info.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/platform/denormal.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/profiler.h" #include "paddle/utils/string/string_helper.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 0e4aeedd90f2c..83f71a4d180aa 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -15,8 +15,8 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/profiler.h" #include "paddle/phi/backends/device_guard.h" diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h deleted file mode 100644 index 011cb134bb2e2..0000000000000 --- a/paddle/fluid/platform/device/device_wrapper.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/**************************** Enforce Wrapper **************************/ - -#pragma once - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/phi/core/platform/device/gpu/gpu_info.h" -#endif - -#ifdef PADDLE_WITH_XPU -#include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/core/platform/device/xpu/xpu_info.h" -#endif - -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/device/ipu/ipu_info.h" -#endif - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/phi/backends/callback_manager.h" -#include "paddle/phi/backends/custom/enforce_custom.h" -#include "paddle/phi/backends/device_guard.h" -#include "paddle/phi/backends/device_manager.h" -#include "paddle/phi/backends/event.h" -#include "paddle/phi/backends/stream.h" -#endif diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 395e4d3c0089c..26ce8dbe04433 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -25,10 +25,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cupti.h" #endif -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/os_info.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/device_context.h" #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 6719a1b6e97bc..957e1d079176f 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/platform/stream_callback_manager.h" -#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/compiled_program.cc b/paddle/fluid/pybind/compiled_program.cc index 06bc07bf2be71..e09ad6f4d441f 100755 --- a/paddle/fluid/pybind/compiled_program.cc +++ b/paddle/fluid/pybind/compiled_program.cc @@ -72,7 +72,6 @@ #include "paddle/phi/core/memory/allocation/cuda_ipc_allocator.h" #endif #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -110,6 +109,7 @@ #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/memory/allocation/mmap_allocator.h" #include "paddle/phi/core/platform/cpu_helper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/monitor.h" #include "paddle/phi/core/platform/profiler.h" diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 67433be39069c..683ea8feb8e6f 100644 --- 
a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -71,7 +71,6 @@ limitations under the License. */ #include "paddle/phi/core/memory/allocation/cuda_ipc_allocator.h" #endif #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -109,6 +108,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/memory/allocation/mmap_allocator.h" #include "paddle/phi/core/platform/cpu_helper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/monitor.h" #include "paddle/phi/core/platform/profiler.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6e98f047be2a3..20b3588311a49 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -87,7 +87,6 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/fluid/operators/py_func_op.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -133,6 +132,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/memory/allocation/mmap_allocator.h" #include "paddle/phi/core/platform/cpu_helper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/monitor.h" #include "paddle/phi/core/platform/profiler.h" diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index e82f87a433e98..742704ef0722a 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -71,7 +71,6 @@ limitations under the License. */ #include "paddle/phi/core/memory/allocation/cuda_ipc_allocator.h" #endif #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -109,6 +108,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/memory/allocation/mmap_allocator.h" #include "paddle/phi/core/platform/cpu_helper.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/monitor.h" #include "paddle/phi/core/platform/profiler.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ac39519159fc9..5977245b7623e 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -30,10 +30,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/memory/memcpy.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" @@ -985,7 +985,7 @@ inline phi::DenseTensor *PySliceTensor(const phi::DenseTensor &self, } inline py::array TensorToPyArray(const phi::DenseTensor &tensor, - py::object copy = py::none()) { + py::object copy = py::none()) { if (!tensor.IsInitialized()) { return py::array(); } @@ -1013,7 +1013,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, framework::TransToProtoVarType(tensor.dtype())); if (!is_gpu_tensor && !is_xpu_tensor && !is_custom_device_tensor) { - if (!copy.is_none()&& !copy) { + if (!copy.is_none() && !copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, diff --git a/test/cpp/fluid/memory/system_allocator_test.cc b/test/cpp/fluid/memory/system_allocator_test.cc index 7a88512c2cfe1..b23c715efeb3b 100644 --- a/test/cpp/fluid/memory/system_allocator_test.cc +++ b/test/cpp/fluid/memory/system_allocator_test.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include #include "paddle/common/flags.h" -#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/core/memory/allocation/allocator.h" +#include "paddle/phi/core/platform/device/device_wrapper.h" COMMON_DECLARE_bool(use_pinned_memory); From 14c5719ac3f8dbca3487430d9f4006df49b4d9ac Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Wed, 9 Oct 2024 13:25:19 +0800 Subject: [PATCH 040/135] blha support rope_theta attr (#68573) --- paddle/phi/infermeta/fusion.cc | 3 +++ paddle/phi/infermeta/fusion.h | 2 ++ paddle/phi/kernels/fusion/gpu/block_attn.h | 2 ++ .../fusion/gpu/block_multi_head_attention_kernel.cu | 7 +++++++ paddle/phi/ops/yaml/fused_ops.yaml | 4 ++-- .../incubate/nn/functional/block_multihead_attention.py | 7 +++++++ 6 files changed, 23 insertions(+), 2 deletions(-) diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index d193a996edb44..4a4ebaeec5b01 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -284,6 +284,7 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, const float quant_min_bound, const float out_scale, const std::string& compute_dtype, + const float rope_theta, MetaTensor* fmha_out, MetaTensor* qkv_out, MetaTensor* key_cache_out, @@ -429,6 +430,7 @@ void BlockMultiheadAttentionInferXPUMeta( const float quant_min_bound, const float out_scale, const std::string& compute_dtype, + const float rope_theta, MetaTensor* fmha_out, MetaTensor* qkv_out, MetaTensor* key_cache_out, @@ -468,6 +470,7 @@ void BlockMultiheadAttentionInferXPUMeta( quant_min_bound, out_scale, compute_dtype, + rope_theta, fmha_out, qkv_out, key_cache_out, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index eec513d179547..2cbdc389a24c6 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -123,6 +123,7 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, const float quant_min_bound, const float out_scale, const std::string& compute_dtype, + const float rope_theta, 
MetaTensor* fmha_out, MetaTensor* qkv_out, MetaTensor* key_cache_out, @@ -166,6 +167,7 @@ void BlockMultiheadAttentionInferXPUMeta( const float quant_min_bound, const float out_scale, const std::string& compute_dtype, + const float rope_theta, MetaTensor* fmha_out, MetaTensor* qkv_out, MetaTensor* key_cache_out, diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h index d31739f96278c..8127f8026f3da 100644 --- a/paddle/phi/kernels/fusion/gpu/block_attn.h +++ b/paddle/phi/kernels/fusion/gpu/block_attn.h @@ -1642,6 +1642,7 @@ void blha(const phi::GPUContext &dev_ctx, const int timestep, const int rotary_emb_dims, float inv_sqrt_dh, + const float rope_theta, const bool add_qkv_bias = true, const bool neox_rotary_style = false, const int quant_round_type = 1, @@ -1731,6 +1732,7 @@ void blha(const phi::GPUContext &dev_ctx, params.timestep = timestep + pre_cache_length; params.inv_sqrt_dh = inv_sqrt_dh; params.rotary_emb_dims = rotary_emb_dims; + params.rope_theta = rope_theta; VLOG(3) << "batch_size: " << batch_size << " q_num_head: " << q_num_head << " kv_num_head: " << kv_num_head << " block_size: " << block_size diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu index a71f6acea28f9..40ec00ccbb0d9 100644 --- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu @@ -313,6 +313,7 @@ void DispatchWithDtype( const float quant_min_bound, const float out_scale, const std::string& compute_dtype, + const float rope_theta, DenseTensor* fmha_out, DenseTensor* qkv_out, DenseTensor* key_cache_out, @@ -754,6 +755,7 @@ void DispatchWithDtype( max_dec_len_this_time_data, rope_emb ? 1 : 0, 1. 
/ sqrt(dim_head), + rope_theta, /*compute_bias*/ false, use_neox_style, quant_round_type, @@ -859,6 +861,7 @@ void BlockMultiheadAttentionKernel( const float quant_min_bound, const float out_scale, const std::string& compute_dtype, + const float rope_theta, DenseTensor* fmha_out, DenseTensor* qkv_out, DenseTensor* key_cache_out, @@ -903,6 +906,7 @@ void BlockMultiheadAttentionKernel( quant_min_bound, out_scale, compute_dtype, + rope_theta, fmha_out, qkv_out, key_cache_out, @@ -945,6 +949,7 @@ void BlockMultiheadAttentionKernel( quant_min_bound, out_scale, compute_dtype, + rope_theta, fmha_out, qkv_out, key_cache_out, @@ -990,6 +995,7 @@ void BlockMultiheadAttentionKernel( quant_min_bound, out_scale, compute_dtype, + rope_theta, fmha_out, qkv_out, key_cache_out, @@ -1032,6 +1038,7 @@ void BlockMultiheadAttentionKernel( quant_min_bound, out_scale, compute_dtype, + rope_theta, fmha_out, qkv_out, key_cache_out, diff --git a/paddle/phi/ops/yaml/fused_ops.yaml b/paddle/phi/ops/yaml/fused_ops.yaml index 826cda70c1ec8..19c18678784ad 100644 --- a/paddle/phi/ops/yaml/fused_ops.yaml +++ b/paddle/phi/ops/yaml/fused_ops.yaml @@ -43,7 +43,7 @@ support_dygraph_mode : true - op : block_multihead_attention_ - args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask, Tensor tgt_mask, Tensor cache_k_quant_scales, Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype = "default") + args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask, Tensor tgt_mask, Tensor cache_k_quant_scales, Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype="default", float rope_theta=10000.0) output : Tensor(fmha_out), Tensor(qkv_out), Tensor(key_cache_out), Tensor(value_cache_out) infer_meta : func : BlockMultiheadAttentionInferMeta @@ -57,7 +57,7 @@ skip_transform : max_enc_len_this_time, max_dec_len_this_time - op : block_multihead_attention_xpu - args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor cache_k_per_batch_maxs, Tensor cache_v_per_batch_maxs, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask, Tensor tgt_mask, Tensor cache_k_quant_scales, 
Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype = "default") + args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor cache_k_per_batch_maxs, Tensor cache_v_per_batch_maxs, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask, Tensor tgt_mask, Tensor cache_k_quant_scales, Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype="default", float rope_theta=10000.0) output : Tensor(fmha_out), Tensor(qkv_out), Tensor(key_cache_out), Tensor(value_cache_out) infer_meta : func : BlockMultiheadAttentionInferXPUMeta diff --git a/python/paddle/incubate/nn/functional/block_multihead_attention.py b/python/paddle/incubate/nn/functional/block_multihead_attention.py index fcd936f12f8c1..f38c0a06b1ef0 100644 --- a/python/paddle/incubate/nn/functional/block_multihead_attention.py +++ b/python/paddle/incubate/nn/functional/block_multihead_attention.py @@ -67,6 +67,7 @@ def block_multihead_attention( quant_min_bound: float = -127.0, out_scale: float = -1, compute_dtype: str = "default", + rope_theta: float = 10000.0, ) -> tuple[Tensor, Tensor, Tensor, Tensor]: """ Block Multi-head attention for text summarization. @@ -107,6 +108,7 @@ def block_multihead_attention( quant_min_bound (Float32): The min bound of float type to int type. out_scale (Float32): The quant scale of fmha_out. Default is -1, which means do not apply quantization for fmha_out. compute_dtype (Str): A compute dtype, is used to represent the input data type. Default is "default", which means compute dtype is determined by input dtype. However, if the dtype of input is Int32, this value should be set to actual dtype of the model. + rope_theta (Float32): The theta of RoPE. Default is 10000.0. Returns: Tensor|(output, qkv_out, cache_k_out, cache_v_out), which output is the output of block_multihead_attention layers, qkv_out is inplace with input `qkv`, cache_k_out and cache_v_out are inplace with input `cache_k` and `cache_v`. 
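(For context on what the new `rope_theta` attribute controls: in rotary position embedding, theta is the base of the inverse-frequency schedule, so larger values stretch the positional wavelengths of the rotation. Below is a minimal NumPy sketch of the standard schedule — illustrative only and not part of this patch; the fused kernel derives these frequencies internally from `rope_theta`.)

import numpy as np

def rope_inv_freq(head_dim: int, rope_theta: float = 10000.0) -> np.ndarray:
    # Standard RoPE inverse-frequency schedule: one frequency per pair of
    # channels; position p rotates channel pair i by angle p * inv_freq[i].
    return 1.0 / (rope_theta ** (np.arange(0, head_dim, 2) / head_dim))

# e.g. for head_dim=128, the first few frequencies of the default schedule:
print(rope_inv_freq(128, rope_theta=10000.0)[:4])
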
@@ -334,6 +336,7 @@ def block_multihead_attention( quant_min_bound, out_scale, compute_dtype, + rope_theta, ) helper = LayerHelper('block_multihead_attention', **locals()) @@ -402,6 +405,7 @@ def block_multihead_attention( 'quant_min_bound': quant_min_bound, 'out_scale': out_scale, 'compute_dtype': compute_dtype, + 'rope_theta': rope_theta, }, ) return out, qkv, key_cache, value_cache @@ -445,6 +449,7 @@ def block_multihead_attention_xpu( quant_min_bound: float = -127.0, out_scale: float = -1, compute_dtype: str = "default", + rope_theta: float = 10000.0, ) -> tuple[Tensor, Tensor, Tensor, Tensor]: if in_dynamic_mode(): return _C_ops.block_multihead_attention_xpu( @@ -485,6 +490,7 @@ def block_multihead_attention_xpu( quant_min_bound, out_scale, compute_dtype, + rope_theta, ) helper = LayerHelper('block_multihead_attention_xpu', **locals()) @@ -555,6 +561,7 @@ def block_multihead_attention_xpu( 'quant_min_bound': quant_min_bound, 'out_scale': out_scale, 'compute_dtype': compute_dtype, + 'rope_theta': rope_theta, }, ) return out, qkv, key_cache, value_cache From 8fdba0b1a4af3d974a598ea6dabfd2160aba754e Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:18:34 +0800 Subject: [PATCH 041/135] =?UTF-8?q?=E3=80=90pir=E3=80=91modify=20name=20of?= =?UTF-8?q?=20Fetch=20input=20(#68530)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify old ir put_along_axis api * upate * modify paddletest * modify fetch input name --------- Co-authored-by: phlrain --- python/paddle/static/pir_io.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/static/pir_io.py b/python/paddle/static/pir_io.py index d25b332a9beff..350275b096d89 100644 --- a/python/paddle/static/pir_io.py +++ b/python/paddle/static/pir_io.py @@ -326,10 +326,11 @@ def normalize_pir_program(program, feed_vars, fetch_vars, **kwargs): fetch_vars_tuple = [] for i, var in enumerate(clone_fetch_vars): - if "name" in var.get_defining_op().attrs(): - fetch_vars_tuple.append( - (var, var.get_defining_op().attrs()['name']) - ) + scale_op = var.get_defining_op() + if scale_op.name() == "pd_op.scale": + orig_var = scale_op.operand_source(0) + if orig_var.has_name: + fetch_vars_tuple.append((orig_var, orig_var.name)) else: fetch_vars_tuple.append((var, "fetch_name_" + str(i))) with paddle.static.program_guard(copy_program): From bf5fedd716f31f587a1733053dd4a3ef6e022ef4 Mon Sep 17 00:00:00 2001 From: crazyxiaoxi <113622186+crazyxiaoxi@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:57:01 +0800 Subject: [PATCH 042/135] =?UTF-8?q?[CINN]=20=E3=80=90Infer=20Symbolic=20Sh?= =?UTF-8?q?ape=20BUAA=20=E3=80=91Add=20matmul=5Fwith=5Fflatten=20op=20=20(?= =?UTF-8?q?#68446)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * first * fix * codestyle * fix * fix --------- Co-authored-by: Jeff114514 <928430299@qq.com> --- .../infer_symbolic_shape/binary_infer_sym.cc | 84 +++++++++++++++++-- .../infer_symbolic_shape/binary_infer_sym.h | 2 +- paddle/phi/ops/yaml/legacy/static_ops.yaml | 1 + 3 files changed, 80 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc index 24ea5653a23e1..67d687fadd59a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc +++ 
b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc @@ -1315,12 +1315,84 @@ bool MarginCrossEntropyOpInferSymbolicShape( return true; } -// bool MatmulWithFlattenOpInferSymbolicShape(pir::Operation *op, -// pir::InferSymbolicShapeContext -// *infer_context) { -// // pass -// return true; -// } +bool MatmulWithFlattenOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const auto &x_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(0)); + const auto &y_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(1)); + std::vector x_dims = x_shape_or_data.shape(); + std::vector y_dims = y_shape_or_data.shape(); + + int x_num_col_dims = + op->attribute("x_num_col_dims").data(); + int y_num_col_dims = + op->attribute("y_num_col_dims").data(); + + PADDLE_ENFORCE_GT( + x_dims.size(), + x_num_col_dims, + common::errors::InvalidArgument( + "The input tensor X's dimensions of MulOp " + "should be larger than x_num_col_dims. But received X's " + "dimensions = %d, X's shape = [%s], x_num_col_dims = %d.", + x_dims.size(), + x_dims, + x_num_col_dims)); + PADDLE_ENFORCE_GT( + y_dims.size(), + y_num_col_dims, + common::errors::InvalidArgument( + "The input tensor Y's dimensions of MulOp " + "should be larger than y_num_col_dims. But received Y's " + "dimensions = %d, Y's shape = [%s], y_num_col_dims = %d.", + y_dims.size(), + y_dims, + y_num_col_dims)); + + auto slice = + [](const std::vector &dims, int begin, int end) { + std::vector slice_dims; + slice_dims.reserve(end - begin); + for (int i = begin; i < end; ++i) { + slice_dims.push_back(dims[i]); + } + return slice_dims; + }; + auto x_mat_dims = slice(x_dims, x_num_col_dims, x_dims.size()); + auto y_mat_dims = slice(y_dims, 0, y_num_col_dims); + + PADDLE_ENFORCE_EQ(x_mat_dims.size(), + y_mat_dims.size(), + common::errors::InvalidArgument( + "The second dimension of input x_mat_dims should be " + "equal to the first dimension of input y_mat_dims. 
But " + "received X's shape = [%s], Y's shape = [%s].", + x_mat_dims.size(), + y_mat_dims.size())); + + for (size_t i = 0; i < x_mat_dims.size(); ++i) { + infer_context->AddEqualCstr(x_mat_dims[i], y_mat_dims[i]); + } + + std::vector output_dims; + output_dims.reserve( + static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (size_t i = 0; i < static_cast(x_num_col_dims); ++i) { + output_dims.push_back(x_dims[i]); + } + for (size_t i = static_cast(y_num_col_dims); i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + infer_context->SetShapeOrDataForValue( + op->result(0), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(output_dims)}); + + return true; +} bool MvOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h index 9ab5332b09409..aa8509698f2d1 100755 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.h @@ -73,7 +73,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(MaskedSelect) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Matmul) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MatrixNms) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MarginCrossEntropy) -// OP_DECLARE_INFER_SYMBOLIC_SHAPE(MatmulWithFlatten) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(MatmulWithFlatten) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Mv) OP_DECLARE_INFER_SYMBOLIC_SHAPE(PriorBox) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(PullBoxSparse) diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index 2c32090a7ec4f..498a07004c564 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -539,6 +539,7 @@ func : matmul_with_flatten data_type : x backward : matmul_with_flatten_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : matrix_rank args : (Tensor x, Tensor tol_tensor, float tol=0.0f, bool hermitian=false, bool use_default_tol=true) From c16ee765fe1e35a5fd653ced5563e7f23d24923b Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 9 Oct 2024 15:19:27 +0800 Subject: [PATCH 043/135] [DLPACK] Fix size of allocation in from_dlpack for strided Tensor (#68559) corresponding unittest --- paddle/fluid/framework/tensor_util.cc | 19 ++++++++---- test/legacy_test/test_dlpack.py | 43 +++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 20cd5baa22e76..7d8ace4371d12 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -872,13 +872,9 @@ phi::DenseTensor from_blob(void* data, phi::DataType dtype, const phi::Place& place, const Deleter& deleter) { - PADDLE_ENFORCE_NOT_NULL( - data, common::errors::InvalidArgument("data can not be nullptr.")); - auto meta = phi::DenseTensorMeta(dtype, shape, strides); - size_t size = SizeOf(dtype) * (meta.is_scalar ? 
1 : product(meta.dims)); - phi::Allocation::DeleterFnPtr f = nullptr; + phi::Allocation::DeleterFnPtr f = nullptr; if (deleter) { auto g = [deleter, src](phi::Allocation* p) { if (src->manager_ctx) { @@ -894,7 +890,18 @@ phi::DenseTensor from_blob(void* data, f = DeleterBridge; } - auto alloc = std::make_shared(data, size, f, place); + // Calculate the number of elements of underlying storage + size_t size = 1; + for (auto i = 0; i < shape.size(); ++i) { + if (shape[i] == 0) { + size = 0; + break; + } + size += strides[i] * (shape[i] - 1); + } + + auto alloc = + std::make_shared(data, size * SizeOf(dtype), f, place); return phi::DenseTensor(alloc, meta); } diff --git a/test/legacy_test/test_dlpack.py b/test/legacy_test/test_dlpack.py index 6f9cad3e253ac..6552bc1a662fe 100644 --- a/test/legacy_test/test_dlpack.py +++ b/test/legacy_test/test_dlpack.py @@ -31,6 +31,7 @@ def test_dlpack_dygraph(self): self.assertTrue( isinstance(out_from_dlpack, paddle.base.core.eager.Tensor) ) + self.assertEqual(str(tensor.place), str(out_from_dlpack.place)) np.testing.assert_array_equal( out_from_dlpack.numpy(), np.array([1, 2, 3, 4]).astype("int") ) @@ -41,6 +42,7 @@ def test_dlpack_tensor_larger_than_2dim(self): t = paddle.to_tensor(numpy_data) dlpack = paddle.utils.dlpack.to_dlpack(t) out = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(str(t.place), str(out.place)) np.testing.assert_allclose(numpy_data, out.numpy(), rtol=1e-05) def test_dlpack_static(self): @@ -100,7 +102,7 @@ def test_dlpack_dtype_and_place_consistency(self): o = paddle.utils.dlpack.from_dlpack(dlpack) self.assertEqual(x.dtype, o.dtype) np.testing.assert_allclose(x.numpy(), o.numpy(), rtol=1e-05) - self.assertEqual(type(x.place), type(o.place)) + self.assertEqual(str(x.place), str(o.place)) complex_dtypes = ["complex64", "complex128"] for place in places: @@ -114,7 +116,7 @@ def test_dlpack_dtype_and_place_consistency(self): o = paddle.utils.dlpack.from_dlpack(dlpack) self.assertEqual(x.dtype, o.dtype) np.testing.assert_allclose(x.numpy(), o.numpy(), rtol=1e-05) - self.assertEqual(type(x.place), type(o.place)) + self.assertEqual(str(x.place), str(o.place)) def test_dlpack_deletion(self): # See Paddle issue 47171 @@ -129,6 +131,7 @@ def test_dlpack_deletion(self): ) dlpack = paddle.utils.dlpack.to_dlpack(a) b = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(str(a.place), str(b.place)) def test_to_dlpack_for_loop(self): # See Paddle issue 50120 @@ -154,6 +157,7 @@ def test_to_dlpack_modification(self): y = paddle.utils.dlpack.from_dlpack(dlpack) y[1:2, 2:5] = 2.0 np.testing.assert_allclose(x.numpy(), y.numpy()) + self.assertEqual(str(x.place), str(y.place)) def test_to_dlpack_data_ptr_consistency(self): # See Paddle issue 50120 @@ -168,6 +172,7 @@ def test_to_dlpack_data_ptr_consistency(self): y = paddle.utils.dlpack.from_dlpack(dlpack) self.assertEqual(x.data_ptr(), y.data_ptr()) + self.assertEqual(str(x.place), str(y.place)) def test_to_dlpack_strides_consistency(self): with dygraph_guard(): @@ -182,6 +187,8 @@ def test_to_dlpack_strides_consistency(self): y = paddle.utils.dlpack.from_dlpack(dlpack) self.assertEqual(x_strided.strides, y.strides) + self.assertEqual(str(x_strided.place), str(y.place)) + np.testing.assert_equal(x_strided.numpy(), y.numpy()) def test_to_dlpack_from_ext_tensor(self): with dygraph_guard(): @@ -192,6 +199,38 @@ def test_to_dlpack_from_ext_tensor(self): self.assertEqual(x.__array_interface__['data'][0], y.data_ptr()) np.testing.assert_allclose(x, y.numpy()) + def 
test_to_dlpack_from_zero_dim(self): + with dygraph_guard(): + places = [base.CPUPlace()] + if paddle.is_compiled_with_cuda(): + places.append(base.CUDAPlace(0)) + for place in places: + for _ in range(4): + x = paddle.to_tensor(1.0, place=place) + dlpack = paddle.utils.dlpack.to_dlpack(x) + y = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(x.data_ptr(), y.data_ptr()) + self.assertEqual(str(x.place), str(y.place)) + self.assertEqual(y.shape, []) + self.assertEqual(y.numel().item(), 1) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + + def test_to_dlpack_from_zero_size(self): + with dygraph_guard(): + places = [base.CPUPlace()] + if paddle.is_compiled_with_cuda(): + places.append(base.CUDAPlace(0)) + for place in places: + for _ in range(4): + x = paddle.zeros([0, 10]).to(device=place) + dlpack = paddle.utils.dlpack.to_dlpack(x) + y = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(x.data_ptr(), y.data_ptr()) + self.assertEqual(str(x.place), str(y.place)) + self.assertEqual(y.shape, [0, 10]) + self.assertEqual(y.numel().item(), 0) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + class TestRaiseError(unittest.TestCase): def test_to_dlpack_raise_type_error(self): From 50c3f5b34d95648fbad953630812ccb7c6d4a725 Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:54:45 +0800 Subject: [PATCH 044/135] [PIR]open squared l2 norm op test (#68490) --- test/legacy_test/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index e5d3356a44528..9f604f36572fa 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -15,11 +15,6 @@ list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op") string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") -# The following unittest is now in deprecated dir, we can delete this code when we move it from deprecated dir to this dir -###### start ###### -list(REMOVE_ITEM TEST_OPS test_squared_l2_norm_op) -###### end ###### - list(REMOVE_ITEM TEST_OPS test_fractional_max_pool3d_op) list(REMOVE_ITEM TEST_OPS test_householder_product) list(REMOVE_ITEM TEST_OPS test_conv2d_op_depthwise_conv) From f25bfbb4c6339acd4bb33940fb4eab3646f76165 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 9 Oct 2024 17:34:55 +0800 Subject: [PATCH 045/135] [Inference]Add expand and expand_as converter (#68546) * add expand and expand_as * fix slice --- .../transforms/tensorrt/trt_op_marker_pass.cc | 2 + python/paddle/tensorrt/converter.py | 4 +- python/paddle/tensorrt/converter_utils.py | 40 ++++++++- python/paddle/tensorrt/impls/manipulation.py | 85 +++++++++++++++++-- test/tensorrt/test_converter_manipulation.py | 62 ++++++++++++++ 5 files changed, 185 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc index 03ffe54971d44..04cd8021187c6 100644 --- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc +++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc @@ -71,6 +71,7 @@ DEFINE_GENERAL_PATTERN(FusedConv2dAddAct, paddle::dialect::FusedConv2dAddActOp) DEFINE_GENERAL_PATTERN(DepthwiseConv2d, paddle::dialect::DepthwiseConv2dOp) DEFINE_GENERAL_PATTERN(Shape, paddle::dialect::ShapeOp) DEFINE_GENERAL_PATTERN(Expand, paddle::dialect::ExpandOp) +DEFINE_GENERAL_PATTERN(ExpandAs, paddle::dialect::ExpandAsOp) DEFINE_GENERAL_PATTERN(Sigmoid, paddle::dialect::SigmoidOp) 
DEFINE_GENERAL_PATTERN(Sqrt, paddle::dialect::SqrtOp) DEFINE_GENERAL_PATTERN(Hardsigmoid, paddle::dialect::HardsigmoidOp) @@ -1308,6 +1309,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass { ADD_PATTERN(Gelu) ADD_PATTERN(Shape) ADD_PATTERN(Expand) + ADD_PATTERN(ExpandAs) ADD_PATTERN(Sigmoid) ADD_PATTERN(Sqrt) ADD_PATTERN(Hardsigmoid) diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 27dfc8f9a357f..4af300760df5e 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -180,17 +180,19 @@ def convert_subgraph_to_trt(self, program, group_op): continue define_op_name = source.get_defining_op().name() if define_op_name == "builtin.combine": + operand_list = [] for combined_operand in source.get_defining_op().operands(): combined_source = combined_operand.source() combined_source_id = combined_source.id if combined_source_id in value_to_trt_tensor: - operands.append( + operand_list.append( value_to_trt_tensor[combined_source_id] ) else: raise RuntimeError( f'{combined_source_id} not found in value_to_trt_tensor' ) + operands.append(operand_list) else: source_id = source.id if source_id in value_to_trt_tensor: diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py index edcea52ad4bfe..0e482697a34b7 100644 --- a/python/paddle/tensorrt/converter_utils.py +++ b/python/paddle/tensorrt/converter_utils.py @@ -31,6 +31,9 @@ __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) +version = trt.__version__ +version_list = list(map(int, version.split('.'))) + def has_dynamic_shape(shape): return any(s == -1 for s in shape) @@ -159,11 +162,46 @@ def add_elementwise_layer(network, paddle_op, inputs, op_type): # Create and add 1D constant layer def add_1D_constant_layer(network, data, dtype=np.int32): - constant_data = np.array([data], dtype=dtype) + if not isinstance(data, list): + data = [data] + constant_data = np.array(data, dtype=dtype) constant_layer = network.add_constant(constant_data.shape, constant_data) return constant_layer.get_output(0) +# Concat not make rank changed +def trt_concat(network, inputs, axis=0): + concat_layer = network.add_concatenation(inputs=inputs) + if axis != 0: + concat_layer.axis = axis + return concat_layer.get_output(0) + + +def trt_cast(network, input, dtype): + identity_layer = network.add_identity(input) + identity_layer.set_output_type(0, dtype) + identity_layer.get_output(0).dtype = dtype + return identity_layer.get_output(0) + + +def trt_shape(network, input): + shape_layer = network.add_shape(input) + if version_list[0] >= 10: # trt_version >=10 + return trt_cast(network, shape_layer.get_output(0), trt.int32) + return shape_layer.get_output(0) + + +def trt_reshape(network, input, new_shape, name="", is_shape_tensor=False): + reshape_layer = network.add_shuffle(input) + if is_shape_tensor: + reshape_layer.set_input(1, new_shape) + else: + reshape_layer.reshape_dims = new_shape + if name != "": + reshape_layer.name = name + return reshape_layer.get_output(0) + + # Get element tensor of 1D shape tensor def get_shape_tensor_element(network, x, index): assert index >= 0, ( diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 6c935e93a9020..f8b1282ecb23f 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -22,8 +22,11 @@ get_positive_dim, get_shape_tensor_element, has_dynamic_shape, + trt_concat, trt_max, 
trt_min, + trt_reshape, + trt_shape, trt_sub, trt_sum, ) @@ -156,8 +159,8 @@ def flatten_converter(network, paddle_op, inputs): # In the converter, pd_op.concat has three inputs, because builtin.combine has two inputs. @converter_registry.register("pd_op.concat", trt_version="8.x") def concat_converter(network, paddle_op, inputs): - input_tensors = inputs[:-1] - axis_tensor = inputs[-1] + input_tensors = inputs[0] + axis_tensor = inputs[1] concat_layer = network.add_concatenation(inputs=input_tensors) axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] @@ -214,6 +217,76 @@ def squeeze_converter(network, paddle_op, inputs): return layer.get_output(0) +def get_expand_output(network, input, rank, shape_tensor, shape_rank): + if rank < shape_rank: + one_rank_tensor = add_1D_constant_layer( + network, [1] * (shape_rank - rank) + ) + in_shape_tensor = trt_shape(network, input) + itensors = [one_rank_tensor, in_shape_tensor] + input_shape_tensor = trt_concat(network, itensors) + else: + input_shape_tensor = trt_shape(network, input) + + new_input_tensor = trt_reshape(network, input, input_shape_tensor, "", True) + + start = [0] * shape_rank + starts_tensor = add_1D_constant_layer(network, start) + one_tensor = add_1D_constant_layer(network, 1) + sizes_tensor = trt_max(network, input_shape_tensor, shape_tensor) + input_sub_tensor = trt_sub(network, input_shape_tensor, one_tensor) + strides_tensor = trt_min(network, one_tensor, input_sub_tensor) + + slice_layer = network.add_slice( + new_input_tensor, start, [0] * len(start), [0] * len(start) + ) + slice_layer.set_input(1, starts_tensor) + slice_layer.set_input(2, sizes_tensor) + slice_layer.set_input(3, strides_tensor) + + return slice_layer.get_output(0) + + +@converter_registry.register("pd_op.expand", trt_version="8.x") +def expand_converter(network, paddle_op, inputs): + input = inputs[0] + input_dims = input.shape + rank = len(input_dims) + paddle_shape_tensor = paddle_op.operands()[1].source() + + shape_tensor_source_op = paddle_shape_tensor.get_defining_op() + if shape_tensor_source_op.name() == "pd_op.full_int_array": + shape = shape_tensor_source_op.attrs()["value"] + shape_tensor = add_1D_constant_layer(network, shape) + shape_rank = len(shape) + elif paddle_shape_tensor.type().as_vec_type(): + shape_tensors = inputs[1] + shape_rank = len(shape_tensors) + shape_tensor = trt_concat(network, shape_tensors) + else: + shape_tensor = inputs[1] + shape_rank = shape_tensor.shape[0] + return get_expand_output(network, input, rank, shape_tensor, shape_rank) + + +@converter_registry.register("pd_op.expand_as", trt_version="8.x") +def expand_as_converter(network, paddle_op, inputs): + input = inputs[0] + input_dims = input.shape + rank = len(input_dims) + y = paddle_op.operands()[1].source() + + if y.initialized(): + y_t = inputs[1] + shape_tensor = trt_shape(network, y_t) + shape_rank = len(y_t.shape) + else: + shape = paddle_op.attrs().get("target_shape") + shape_tensor = add_1D_constant_layer(network, shape) + shape_rank = len(shape) + return get_expand_output(network, input, rank, shape_tensor, shape_rank) + + @converter_registry.register("pd_op.cast", trt_version="8.x") @converter_registry.register("pd_op.cast_", trt_version="8.x") def cast_converter(network, paddle_op, inputs): @@ -270,7 +343,7 @@ def slice_converter(network, paddle_op, inputs): len(starts), len(axes), ) - for idx in axes: + for idx in range(len(axes)): if starts[idx] < 0: starts_tensor[axes[idx]] = trt_max( network, @@ -293,7 +366,7 @@ def 
slice_converter(network, paddle_op, inputs): ) else: starts = inputs[1] - for idx in axes: + for idx in range(len(axes)): starts_tensor[axes[idx]] = get_shape_tensor_element( network, starts, idx ) @@ -306,7 +379,7 @@ def slice_converter(network, paddle_op, inputs): len(ends), len(axes), ) - for idx in axes: + for idx in range(len(axes)): if ends[idx] < 0: ends_tensor[axes[idx]] = trt_max( network, @@ -329,7 +402,7 @@ def slice_converter(network, paddle_op, inputs): ) else: ends = inputs[2] - for idx in axes: + for idx in range(len(axes)): ends_tensor[axes[idx]] = get_shape_tensor_element( network, ends, idx ) diff --git a/test/tensorrt/test_converter_manipulation.py b/test/tensorrt/test_converter_manipulation.py index 15af4f2e30996..ccfee02e18281 100644 --- a/test/tensorrt/test_converter_manipulation.py +++ b/test/tensorrt/test_converter_manipulation.py @@ -92,6 +92,36 @@ def test_trt_result(self): self.check_trt_result() +class TestExpandTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.expand + self.api_args = { + "x": np.random.randn(1, 3).astype("float32"), + "shape": [6, 3], + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3]} + self.max_shape = {"x": [6, 3]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestExpandWithShapeTensorTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.expand + self.api_args = { + "x": np.random.randn(1, 3).astype("float32"), + "shape": np.array([6, 3]).astype("int32"), + } + self.program_config = {"feed_list": ["x", "shape"]} + self.min_shape = {"x": [1, 3]} + self.max_shape = {"x": [6, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def slice_api(x, axes, starts, ends, infer_flags, decrease_axis): return _C_ops.slice(x, axes, starts, ends, infer_flags, decrease_axis) @@ -115,6 +145,21 @@ def test_trt_result(self): self.check_trt_result() +class TestExpandWithDiffRankTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.expand + self.api_args = { + "x": np.array([1, 2, 3]).astype("float32"), + "shape": [2, 3], + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {} + self.max_shape = {} + + def test_trt_result(self): + self.check_trt_result() + + class TestSliceTRTPattern(TensorRTBaseTest): def setUp(self): self.python_api = paddle.slice @@ -132,6 +177,23 @@ def test_trt_result(self): self.check_trt_result() +class TestExpandAsTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.expand_as + self.api_args = { + "x": np.array([[1, 2, 3]]).astype("float32"), + "y": np.array([[1, 2, 3], [4, 5, 6], [1, 2, 3], [4, 5, 6]]).astype( + "int32" + ), + } + self.program_config = {"feed_list": ["x", "y"]} + self.min_shape = {"x": [1, 3]} + self.max_shape = {"x": [4, 3]} + + def test_trt_result(self): + self.check_trt_result() + + class TestSliceWithInputStartTRTPattern(TensorRTBaseTest): def setUp(self): self.python_api = paddle.slice From a521f82a4b6dd3c36d1cabfde95d52e93590e177 Mon Sep 17 00:00:00 2001 From: Chang Lu <55493212+AndSonder@users.noreply.github.com> Date: Wed, 9 Oct 2024 19:00:56 +0800 Subject: [PATCH 046/135] [AutoParallel] Fit vpp for amp (#68545) * add enable send recv * fix for vpp when open fused pass * fix * fix * Update ir_backward.py * format code * fix vpp when using amp master grad * recover third_party * fix grad merge * recover some changes --- python/paddle/amp/grad_scaler.py | 14 +++++++- .../auto_parallel/static/engine.py | 4 ++- 
.../auto_parallel/static/pir_pass.py | 34 ++++++++++++------- .../reshard_funcs/sub_to_global_mesh_func.py | 5 ++- 4 files changed, 42 insertions(+), 15 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 281c76ddc4827..4ba1524a307d9 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -238,7 +238,19 @@ def scale(self, var: Tensor) -> Tensor: var = var.astype('float32') if not self._use_dynamic_loss_scaling: return var - return var * self._scale + scale_out = paddle._C_ops.multiply(var, self._scale) + multiply_op = scale_out.get_defining_op() + src_var_op = var.get_defining_op() + if multiply_op.dist_attr and src_var_op.dist_attr: + multiply_op.dist_attr = ( + paddle.base.libpaddle.pir.create_op_dist_attribute( + multiply_op.dist_attr.process_mesh, + multiply_op.dist_attr.operands(), + multiply_op.dist_attr.results(), + src_var_op.dist_attr.chunk_id, + ) + ) + return scale_out # NOTE(lizhiyu): We hack here to avoid changing the `dist_attr` of `self._scale` of 'no-calculation-rank' if not self._enable or not var._is_initialized(): diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 40f6679ea8d46..aec38a996cc10 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -732,7 +732,9 @@ def _parallel_pir(self, mode): self._strategy.pipeline.enable and self._strategy.pipeline.schedule_mode == "VPP" ): - complete_chunk_id(dist_program, self._strategy.pipeline) + complete_chunk_id( + dist_program, startup_program, self._strategy.pipeline + ) # Step 1.2: pir backward if mode == "train" and self._loss and self._optimizer: diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index c6704ac9e625c..be3703bc69ee6 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -24,6 +24,7 @@ from paddle.base.log_helper import get_logger from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.distributed.passes.pass_base import PassContext, new_pass +from paddle.distributed.passes.pass_utils import infer_chunk_id from .mix_to_dist_pass import dist_skip_op_list from .process_group import get_process_group @@ -811,7 +812,7 @@ def set_process_mesh(vars, attrs): ) -def complete_chunk_id(dist_program, pipeline_strategy): +def complete_chunk_id(dist_program, startup_program, pipeline_strategy): if not pipeline_strategy.enable: return @@ -975,6 +976,14 @@ def complete_chunk_id(dist_program, pipeline_strategy): # Step6: add reshard op between pipeline chunks apply_partition_pass(dist_program) + for op in startup_program.global_block().ops: + if op.name() == "builtin.set_parameter": + param_name = op.str_attr("parameter_name") + startup_param = op.operand_source(0) + param = dist_program.get_parameter_value_by_name(param_name) + if param.dist_attr(): + startup_param.update_dist_attr(param.dist_attr()) + def check_chunk_id(dist_program): all_ops = dist_program.global_block().ops @@ -993,22 +1002,23 @@ def check_chunk_id(dist_program): all_used_ops = op.result(0).all_used_ops() for used_op in all_used_ops: if used_op.dist_attr.chunk_id != -1: - op.dist_attr = paddle.base.libpaddle.pir.create_op_dist_attribute( - op.dist_attr.process_mesh, - op.dist_attr.operands(), - op.dist_attr.results(), - 
used_op.dist_attr.chunk_id, + op.dist_attr = copy_op_attr_with_new_member( + op.dist_attr, + new_chunk_id=used_op.dist_attr.chunk_id, ) break - if op.dist_attr.chunk_id == -1: - raise ValueError( - f"The chunk_id of op[{op.name()}] is not set. Please check the chunk_id setting." - ) + else: - raise ValueError( - f"The chunk_id of op[{op.name()}] is not set. Please check the chunk_id setting." + op_chunk_id = infer_chunk_id(idx, all_ops) + op.dist_attr = copy_op_attr_with_new_member( + op.dist_attr, new_chunk_id=op_chunk_id ) + if op.dist_attr.chunk_id == -1: + raise ValueError( + f"The chunk_id of op[{op.name()}] is not set. Please check the chunk_id setting." + ) + def check_order(op_list, order): pointer = 0 diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py index 1aab837adbdde..dd2418fcec65b 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py @@ -63,6 +63,9 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): if cur_rank == root_rank: # the root rank will broadcast the src_value to other ranks + chunk_id = -1 + if src_value.get_defining_op().dist_attr: + chunk_id = src_value.get_defining_op().dist_attr.chunk_id tmp_value = paddle._C_ops.share_data_(src_value) value_type = paddle.base.libpaddle.pir.cvt_to_dist_type( src_value.type(), src_value.dist_attr() @@ -70,7 +73,7 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): tmp_value.set_type(value_type) op = tmp_value.get_defining_op() op.dist_attr = paddle.base.libpaddle.pir.create_op_dist_attribute( - src_mesh, [src_dist_attr], [src_dist_attr] + src_mesh, [src_dist_attr], [src_dist_attr], chunk_id ) else: # create the buffer on other ranks for receving the data From 1f9ff4307ac27a9768e9150923ae5ed6d86eb573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=A0=E7=8C=9C?= Date: Wed, 9 Oct 2024 20:23:40 +0800 Subject: [PATCH 047/135] =?UTF-8?q?=E3=80=90Error=20Message=E3=80=91=20Mis?= =?UTF-8?q?cellaneous=20Modifications.2=20(#68578)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove unnecessary files and adjusted corresponding CMakeList * fix error in test\cpp\inference\api\xpu_config_resnet50_test.cc --- paddle/cinn/ir/schedule/CMakeLists.txt | 10 +--- paddle/cinn/ir/schedule/ir_schedule.cc | 1 - paddle/cinn/ir/schedule/ir_schedule_error.cc | 39 ------------- paddle/cinn/ir/schedule/ir_schedule_error.h | 56 ------------------- paddle/cinn/ir/schedule/ir_schedule_util.h | 2 +- test/cpp/inference/api/tester_helper.h | 2 +- .../inference/api/xpu_config_resnet50_test.cc | 3 +- 7 files changed, 5 insertions(+), 108 deletions(-) delete mode 100644 paddle/cinn/ir/schedule/ir_schedule_error.cc delete mode 100644 paddle/cinn/ir/schedule/ir_schedule_error.h diff --git a/paddle/cinn/ir/schedule/CMakeLists.txt b/paddle/cinn/ir/schedule/CMakeLists.txt index 4474d1988de94..dc8b5ec35a5d3 100755 --- a/paddle/cinn/ir/schedule/CMakeLists.txt +++ b/paddle/cinn/ir/schedule/CMakeLists.txt @@ -1,14 +1,8 @@ cinn_proto_library(schedule_desc_proto SRCS schedule_desc.proto) core_gather_headers() -gather_srcs( - cinnapi_src - SRCS - schedule_base.cc - ir_schedule.cc - ir_schedule_util.cc - ir_schedule_error.cc - schedule_desc.cc) +gather_srcs(cinnapi_src SRCS schedule_base.cc ir_schedule.cc + 
ir_schedule_util.cc schedule_desc.cc) add_subdirectory(impl) diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index 9beafd9a88f6b..57d78ac2def66 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -37,7 +37,6 @@ #include "paddle/cinn/ir/ir_visitor.h" #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/schedule/impl/ir_schedule.h" -#include "paddle/cinn/ir/schedule/ir_schedule_error.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/lang/compute.h" diff --git a/paddle/cinn/ir/schedule/ir_schedule_error.cc b/paddle/cinn/ir/schedule/ir_schedule_error.cc deleted file mode 100644 index 0b7a098264632..0000000000000 --- a/paddle/cinn/ir/schedule/ir_schedule_error.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/ir/schedule/ir_schedule_error.h" -#include "paddle/cinn/ir/ir.h" -#include "paddle/cinn/ir/ir_printer.h" - -namespace cinn { -namespace ir { - -std::string IRScheduleErrorHandler::GeneralErrorMessage() const { - std::ostringstream os; - os << "[IRScheduleError] An error occurred in the schedule primitive < " - << this->primitive_ << " >. " << std::endl; - os << indent_str_ << "[Error info] " << this->err_msg_; - return os.str(); -} - -std::string IRScheduleErrorHandler::DetailedErrorMessage() const { - std::ostringstream os; - os << GeneralErrorMessage(); - os << indent_str_ << "[Expr info] The Expr of current schedule is:\n" - << this->module_expr_.GetExprs() << std::endl; - return os.str(); -} - -} // namespace ir -} // namespace cinn diff --git a/paddle/cinn/ir/schedule/ir_schedule_error.h b/paddle/cinn/ir/schedule/ir_schedule_error.h deleted file mode 100644 index 1326bfd8852b0..0000000000000 --- a/paddle/cinn/ir/schedule/ir_schedule_error.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/cinn/ir/schedule/ir_schedule.h" - -namespace cinn { -namespace ir { - -/** - * This handler is dealing with the errors happen in in the current - * Scheduling. 
- */ -class IRScheduleErrorHandler : public utils::ErrorHandler { - public: - /** - * \brief constructor - * \param err_msg the error message - */ - explicit IRScheduleErrorHandler(const std::string& primitive, - const std::string& err_msg, - const ModuleExpr& module_expr) - : primitive_(primitive), err_msg_(err_msg), module_expr_(module_expr) {} - - /** - * \brief Returns a short error message corresponding to the kGeneral error - * level. - */ - std::string GeneralErrorMessage() const; - - /** - * \brief Returns a detailed error message corresponding to the kDetailed - * error level. - */ - std::string DetailedErrorMessage() const; - - private: - std::string primitive_; - std::string err_msg_; - ModuleExpr module_expr_; -}; - -} // namespace ir -} // namespace cinn diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 36183bf6248e2..8b9d69b5dde6f 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -27,7 +27,7 @@ #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/op/ir_operators.h" -#include "paddle/cinn/ir/schedule/ir_schedule_error.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/optim/replace_var_with_expr.h" diff --git a/test/cpp/inference/api/tester_helper.h b/test/cpp/inference/api/tester_helper.h index 0d83878057c26..e199a7c78d7ec 100644 --- a/test/cpp/inference/api/tester_helper.h +++ b/test/cpp/inference/api/tester_helper.h @@ -802,7 +802,7 @@ void CompareAccuracy( "[Error info] avg_acc_ref - avg_acc_quant must be less than or " "equal to FLAGS_quantized_accuracy.\n" "[Condition info] Please check your input data.")); - } // test + } } void CompareDeterministic( diff --git a/test/cpp/inference/api/xpu_config_resnet50_test.cc b/test/cpp/inference/api/xpu_config_resnet50_test.cc index fcb81d6b1cb16..86a79c9aa20cd 100644 --- a/test/cpp/inference/api/xpu_config_resnet50_test.cc +++ b/test/cpp/inference/api/xpu_config_resnet50_test.cc @@ -72,8 +72,7 @@ TEST(xpu_config, inference) { config.SetXpuConfig(xpu_config); XpuConfig xpu_config_test = config.xpu_config(); - CHECK_EQ(xpu_config_test.l3_size, l3_size); - PADDLE_ENFORCE_GT(xpu_config_test.l3_size, + PADDLE_ENFORCE_EQ(xpu_config_test.l3_size, l3_size, common::errors::InvalidArgument( "xpu_config_test.l3_size %d is different from our " From 2aebcd88c95ac8c2d917f6485c295f8837f71131 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 9 Oct 2024 20:28:44 +0800 Subject: [PATCH 048/135] Add FLAGS_force_sync_ops for executor (#68467) * Add sync_op_after_launch config for executor * Update code * Update code * Update code --- .../instruction/instruction_base.cc | 2 - .../instruction/instruction_base.h | 11 ++- .../interpreter/execution_config.cc | 76 +++++++++++++++++++ .../interpreter/execution_config.h | 8 +- .../framework/new_executor/pir_interpreter.cc | 32 +++++++- .../framework/new_executor/pir_interpreter.h | 2 + .../new_executor/standalone_executor.cc | 2 + 7 files changed, 127 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index a22bed3cf14f4..bfcc02cdd5179 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -197,8 +197,6 @@ 
InstructionBase::InstructionBase(size_t id, const phi::Place& place) no_need_buffer_values_() { id_ = id; - is_artificial_ = false; - if (phi::is_cpu_place(place)) { type_ = OpFuncType::kCpuSync; } else { diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index b39c3553bbcfa..d2424bbdd6738 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -43,6 +43,9 @@ class InstructionBase { bool IsArtificial() const { return is_artificial_; } void SetArtificial(bool is_artificial) { is_artificial_ = is_artificial; } + bool IsSyncAfterLaunch() const { return sync_after_launch_; } + void SetSyncAfterLaunch(bool sync) { sync_after_launch_ = sync; } + OpFuncType KernelType() const; void SetKernelType(OpFuncType type) { type_ = type; } @@ -176,8 +179,12 @@ class InstructionBase { protected: size_t id_; - bool is_artificial_; // Instruction is artificial means that it is only used - // to assist scheduling and no need to be executed. + bool is_artificial_{ + false}; // Instruction is artificial means that it is only used + // to assist scheduling and no need to be executed. + + bool sync_after_launch_{false}; + OpFuncType type_; // dist attrs:lower value, higher priority diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index e118f47d42e7c..0c9a7dc320d92 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -17,10 +17,25 @@ #include #include +#include "paddle/common/flags.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/phi/backends/device_manager.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/utils/string/string_helper.h" + +// FLAGS_force_sync_ops is used to finer control the op-sync in executor. +// The format is: "micro_batch_id, job_name, op_id, op_name | micro_batch_id, +// job_name, op_id, op_name | ...". Keep spaces to syncs all name/id. Example: +// 1. sync the recv_v2 op in the second backward-job of 1F1B scheduling: +// FLAGS_force_sync_ops="1, backward, , recv_v2" +// 2. sync the full op with op_id=5: FLAGS_force_sync_ops=" , , 5, full" +// 3. sync all ops in the first default-job: FLAGS_force_sync_ops="0,default,, +// 4. 
sync all ops in the forward-job and backward-job: FLAGS_force_sync_ops=" , +// forward, , | , backward, , , " +PHI_DEFINE_EXPORTED_string(force_sync_ops, + "", + "Pattern to force sync ops in executor."); PD_DECLARE_bool(new_executor_serial_run); @@ -149,4 +164,65 @@ void ExecutionConfig::Log(int log_level) { VLOG(log_level) << log_str.str(); } +std::set> GetForceSyncOps( + int micro_batch_id, const std::string& job_name) { + std::set> force_sync_ops; + std::stringstream ss(paddle::string::erase_spaces(FLAGS_force_sync_ops)); + std::string item; + + while (std::getline(ss, item, '|')) { + item += ","; // The comma at the end of the string will be ignored in + // std::getline + std::stringstream item_stream(item); + std::vector tokens; + std::string token; + while (std::getline(item_stream, token, ',')) { + VLOG(1) << "token: " << token; + tokens.push_back(token); + } + + PADDLE_ENFORCE_EQ( + tokens.size(), + 4, + phi::errors::InvalidArgument("Invalid force_sync_ops format: \"%s\", " + "FLAGS_force_sync_ops=\"%s\"", + item, + FLAGS_force_sync_ops)); + + int micro_batch_id_; + if (tokens[0] == "") { + micro_batch_id_ = -1; + } else { + micro_batch_id_ = std::stoi(tokens[0]); + } + if (micro_batch_id_ != micro_batch_id && micro_batch_id_ != -1) { + continue; + } + + if (tokens[1] != job_name && tokens[1] != "") { + continue; + } + + int op_id; + if (tokens[2] == "") { + op_id = -1; + } else { + op_id = std::stoi(tokens[2]); + } + std::string op_name = tokens[3]; + force_sync_ops.insert({op_id, op_name}); + } + + if (!force_sync_ops.empty()) { + std::stringstream ss; + ss << "job_name: " << job_name << ", micro_batch_id: " << micro_batch_id + << ", force_sync_ops: "; + for (auto& pair : force_sync_ops) { + ss << "(" << pair.first << ", " << pair.second << ") "; + } + VLOG(6) << ss.str(); + } + return force_sync_ops; +} + } // namespace paddle::framework::interpreter diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.h b/paddle/fluid/framework/new_executor/interpreter/execution_config.h index 65c17d60ab824..b082f972682d8 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.h +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.h @@ -26,7 +26,6 @@ namespace interpreter { struct ExecutionConfig { bool create_local_scope{true}; - bool used_for_cinn{false}; bool used_for_control_flow_op{false}; bool used_for_jit{false}; @@ -35,6 +34,10 @@ struct ExecutionConfig { size_t device_num_threads{0}; size_t host_num_threads{0}; + std::set> + force_sync_ops; // set{pair}, -1 matches any op_id, "" + // matches any name + std::set force_root_scope_vars; std::set jit_input_vars; std::set skip_gc_vars; @@ -43,6 +46,9 @@ struct ExecutionConfig { void Log(int log_level); }; +std::set> GetForceSyncOps( + int micro_batch_id, const std::string& job_name); + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 2d1aadcd5fe75..f888c4c502981 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -765,6 +765,33 @@ void PirInterpreter::AnalyseExecuteOrderForTrace( } } +void PirInterpreter::AnalyzeForceSyncOps() { + for (auto& ins : vec_instruction_base_) { + ins->SetSyncAfterLaunch(FLAGS_benchmark); + + // Analyze force sync op set by FLAGS_force_sync_op + int op_id = ins->Id(); + std::string op_name = ins->Name(); + std::string 
diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc
index 2d1aadcd5fe75..f888c4c502981 100644
--- a/paddle/fluid/framework/new_executor/pir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -765,6 +765,33 @@ void PirInterpreter::AnalyseExecuteOrderForTrace(
   }
 }

+void PirInterpreter::AnalyzeForceSyncOps() {
+  for (auto& ins : vec_instruction_base_) {
+    ins->SetSyncAfterLaunch(FLAGS_benchmark);
+
+    // Analyze force sync ops set by FLAGS_force_sync_ops
+    int op_id = ins->Id();
+    std::string op_name = ins->Name();
+    std::string unused_prefix = "pd_op.";
+    auto pos = op_name.find(unused_prefix);
+    if (pos != std::string::npos) {
+      op_name.erase(pos, unused_prefix.size());
+    }
+
+    for (auto& pair : execution_config_.force_sync_ops) {
+      int sync_op_id = pair.first;
+      std::string sync_op_name = pair.second;
+      if ((sync_op_id == op_id || sync_op_id == -1) &&
+          (sync_op_name == op_name || sync_op_name == "")) {
+        VLOG(8) << "Force sync op: "
+                << "sync_op_id=" << sync_op_id << ", op_id=" << op_id
+                << ", sync_op_name=" << sync_op_name
+                << ", op_name=" << op_name;
+        ins->SetSyncAfterLaunch(true);
+      }
+    }
+  }
+}
+
 void PirInterpreter::BuildInstruction() {
   VLOG(6) << "Build Instructions for pir ... ";
   vec_instruction_base_.clear();
@@ -1900,7 +1927,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) {
       instr_node->Run();
     }

-    if (FLAGS_benchmark) {
+    if (instr_node->IsSyncAfterLaunch()) {
       instr_node->DeviceContext().Wait();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
@@ -2003,6 +2030,9 @@ void PirInterpreter::PreAnalysis() {
                                 ir_instruction_scheduling_priority_less);
   VLOG(4) << "Done AnalyseExecuteOrderForTrace";

+  AnalyzeForceSyncOps();
+  VLOG(4) << "Done AnalyzeForceSyncOps";
+
   UpdateSyncOpNum();
   VLOG(4) << "Done UpdateSyncOpNum";

diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h
index 71445425b3c38..f00dd040d9240 100644
--- a/paddle/fluid/framework/new_executor/pir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/pir_interpreter.h
@@ -129,9 +129,11 @@ class PirInterpreter : public InterpreterBaseImpl {
   void UpdateSyncOpNum();
   void UpdateNcclOpNum();
   void UpdateOneDNNOpNum();
+
   void AnalyseExecuteOrderForTrace(
       std::map<size_t, std::set<size_t>> op_downstream_map,
       InstructionSchedulingPriorityLess compare);
+  void AnalyzeForceSyncOps();
   void ConstructEventForJitInput();
   void CalculateLastLiveOps();

diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 094e99a392dcd..624e822f1bcba 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -87,6 +87,8 @@ StandaloneExecutor::StandaloneExecutor(const phi::Place& place,
     interpreter::ExecutionConfig execution_config;
     execution_config.create_local_scope = false;
     execution_config.skip_gc_vars = job->SkipGcVars();
+    execution_config.force_sync_ops =
+        interpreter::GetForceSyncOps(micro_batch_id, job_type);

     // TODO(phlrain) we only support cpu for now
     if (FLAGS_enable_pir_in_executor) {

From 63c2a8292621c1b33de0b050e81e0cff13c1cb5a Mon Sep 17 00:00:00 2001
From: cubehan3
Date: Thu, 10 Oct 2024 10:01:10 +0800
Subject: [PATCH 049/135] decomp numel, swish, fmax_grad, fmin_grad (#68319)

---
 .../decomp_interface_gen_op_list.py           |   4 +
 paddle/fluid/primitive/codegen/gen.py         |   3 +
 paddle/fluid/primitive/composite/composite.h  |  20 +++
 paddle/fluid/primitive/rule/vjp/details.h     |  94 +++++++++++
 python/paddle/autograd/backward_utils.py      |   3 +
 test/legacy_test/test_activation_op.py        |   8 +-
 test/legacy_test/test_fmax_op.py              |  18 ++-
 test/legacy_test/test_fmin_op.py              |  18 ++-
 test/legacy_test/test_numel_op.py             |  10 +-
 .../test_prim_sub_graph_dynamic_shape.py      |  36 +++++
 ..._sub_graph_fghij_backward_dynamic_shape.py | 148 ++++++++++++++++++
 ..._sub_graph_pqrst_backward_dynamic_shape.py |  17 ++
 12 files changed, 368 insertions(+), 11 deletions(-)
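Before the diffs, a reference sketch in NumPy of the semantics this patch encodes: numel as the product over dims, swish as x * sigmoid(x), and the NaN-aware masks behind fmax_grad/fmin_grad. Shapes and values below are illustrative only, and the broadcast reduction (reduce_as) in the real rules is elided:

    import numpy as np

    x = np.random.rand(2, 3).astype("float32")
    y = np.random.rand(2, 3).astype("float32")
    y[0, 1] = np.nan

    # numel_decomp: element count as a product over the dims.
    numel = int(np.prod(x.shape))  # == np.size(x)

    # swish_decomp: swish(x) = x * sigmoid(x).
    swish = x * (1.0 / (1.0 + np.exp(-x)))

    # fmax_grad: out_grad flows to x wherever x >= y or y is NaN,
    # and to y everywhere else (fmin_grad mirrors this with <=).
    out_grad = np.ones_like(x)
    with np.errstate(invalid="ignore"):
        mask_x = np.isnan(y) | (x >= y)
    dx = out_grad * mask_x
    dy = out_grad * ~mask_x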
diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py
index aa06fb8269c09..e414007f6ba5a 100644
--- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py
@@ -50,6 +50,7 @@
     "mean",
     "mean_all",
     "meshgrid",
+    "numel",
     "one_hot",
     "p_norm",
     "pow",
@@ -59,6 +60,7 @@
     "sigmoid_cross_entropy_with_logits",
     "silu",
     "swiglu",
+    "swish",
     "softmax",
     "softsign",
     "square",
@@ -101,6 +103,7 @@
     "mean",
     "mean_all",
     "meshgrid",
+    "numel",
     "p_norm",
     "pow",
     "reciprocal",
@@ -109,6 +112,7 @@
     "sigmoid_cross_entropy_with_logits",
     "silu",
     "swiglu",
+    "swish",
     "softmax",
     "softsign",
     "square",
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index e36c66d394b81..f0e9e57c4697b 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -89,6 +89,8 @@
     'elementwise_pow_grad',
     'maximum_grad',
     'reduce_as_grad',
+    'fmax_grad',
+    'fmin_grad',
     'dot_grad',
 ]

@@ -150,6 +152,7 @@
     'sqrt_grad',
     'stack_grad',
     'swiglu',
+    'swish_grad',
 ]  # custom vjp list of composite op
 VJP_COMPS = PRIM_VJP + CUSTOM_VJP

diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h
index f9b143a696114..df1c09e043f6b 100644
--- a/paddle/fluid/primitive/composite/composite.h
+++ b/paddle/fluid/primitive/composite/composite.h
@@ -1521,6 +1521,26 @@ std::vector<Tensor> unstack_decomp(const Tensor& x, int axis, const int num) {
   return res;
 }

+template <typename T>
+Tensor numel_decomp(const Tensor& x) {
+  auto x_shape = x.shape();
+  if (has_dynamic_shape(x_shape)) {
+    const Tensor x_shape_tensor = shape<T>(x);
+    Tensor value = full<T>({1}, 1, x_shape_tensor.dtype());
+    for (size_t i = 0; i < x_shape.size(); ++i) {
+      value = value * get_slice<T>(x_shape_tensor, i);
+    }
+    return cast<T>(reshape<T>(value, {}), DataType::INT64);
+  } else {
+    return full_scalar<T>(x.numel(), DataType::INT64);
+  }
+}
+
+template <typename T>
+Tensor swish_decomp(const Tensor& x) {
+  return x * sigmoid<T>(x);
+}
+
 }  // namespace details
 }  // namespace primitive

diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h
index 091d780d488db..1dcb5141a8331 100644
--- a/paddle/fluid/primitive/rule/vjp/details.h
+++ b/paddle/fluid/primitive/rule/vjp/details.h
@@ -2750,6 +2750,100 @@ void atan_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
   }
 }

+template <typename T>
+void swish_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
+  if (x_grad) {
+    const Tensor one = full_scalar<T>(1.0, x.dtype());
+    const Tensor sig = sigmoid<T>(x);
+    Tensor res = out_grad * sig * (one + x * (one - sig));
+    set_output<T>(res, x_grad);
+  }
+}
+
+template <typename T>
+void fmax_grad(const Tensor& x,
+               const Tensor& y,
+               const Tensor& out_grad,
+               Tensor* x_grad,
+               Tensor* y_grad) {
+  const Tensor nan_x = isnan<T>(x);
+  const Tensor nan_y = isnan<T>(y);
+  Tensor mask_x = backend::logical_or<T>(nan_y, greater_equal<T>(x, y));
+  Tensor mask_y = backend::logical_not<T>(mask_x);
+
+  if (x_grad) {
+    Tensor dx = cast<T>(mask_x, out_grad.dtype()) * out_grad;
+    if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) {
+      dx = reduce_as<T>(dx, x);
+    } else {
+      if (out_grad.dims() != x.dims()) {
+        auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims());
+        Tensor dx_reduce_res =
+            dx.sum(common::vectorize(reduce_dim), x.dtype(), false);
+        dx = reshape<T>(dx_reduce_res, common::vectorize(x.dims()));
+      }
+    }
+    set_output<T>(dx, x_grad);
+  }
+
+  if (y_grad) {
+    Tensor dy = cast<T>(mask_y, out_grad.dtype()) * out_grad;
+    if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) {
+      dy = reduce_as<T>(dy, y);
+    } else {
+      if (out_grad.dims() != y.dims()) {
+        auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), y.dims());
+        Tensor dy_reduce_res =
+            dy.sum(common::vectorize(reduce_dim), y.dtype(), false);
+        dy = reshape<T>(dy_reduce_res, common::vectorize(y.dims()));
+      }
+    }
+    set_output<T>(dy, y_grad);
+  }
+}
+
+template <typename T>
+void fmin_grad(const Tensor& x,
+               const Tensor& y,
+               const Tensor& out_grad,
+               Tensor* x_grad,
+               Tensor* y_grad) {
+  const Tensor nan_x = isnan<T>(x);
+  const Tensor nan_y = isnan<T>(y);
+  Tensor mask_x = backend::logical_or<T>(nan_y, less_equal<T>(x, y));
+  Tensor mask_y = backend::logical_not<T>(mask_x);
+
+  if (x_grad) {
+    Tensor dx = cast<T>(mask_x, out_grad.dtype()) * out_grad;
+    if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) {
+      dx = reduce_as<T>(dx, x);
+    } else {
+      if (out_grad.dims() != x.dims()) {
+        auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims());
+        Tensor dx_reduce_res =
+            dx.sum(common::vectorize(reduce_dim), x.dtype(), false);
+        dx = reshape<T>(dx_reduce_res, common::vectorize(x.dims()));
+      }
+    }
+    set_output<T>(dx, x_grad);
+  }
+
+  if (y_grad) {
+    Tensor dy = cast<T>(mask_y, out_grad.dtype()) * out_grad;
+    if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) {
+      dy = reduce_as<T>(dy, y);
+    } else {
+      if (out_grad.dims() != y.dims()) {
+        auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), y.dims());
+        Tensor dy_reduce_res =
+            dy.sum(common::vectorize(reduce_dim), y.dtype(), false);
+        dy = reshape<T>(dy_reduce_res, common::vectorize(y.dims()));
+      }
+    }
+    set_output<T>(dy, y_grad);
+  }
+}
+
 template <typename T>
 void dot_grad(const Tensor& x,
               const Tensor& y,
diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py
index f4cee95036f4e..643cf11abe6c3 100644
--- a/python/paddle/autograd/backward_utils.py
+++ b/python/paddle/autograd/backward_utils.py
@@ -46,6 +46,8 @@
     "pd_op.exp",
     "pd_op.expand",
     "pd_op.floor",
+    "pd_op.fmax",
+    "pd_op.fmin",
     "pd_op.gather",
     "pd_op.gather_nd",
     "pd_op.gelu",
@@ -83,6 +85,7 @@
     "pd_op.subtract",
     "pd_op.sum",
     "pd_op.swiglu",
+    "pd_op.swish",
     "pd_op.tanh",
     "pd_op.topk",
     "pd_op.unsqueeze",
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py
index 383fc16eeb7a0..6530787618b2a 100644
--- a/test/legacy_test/test_activation_op.py
+++ b/test/legacy_test/test_activation_op.py
@@ -5224,7 +5224,9 @@ def ref_swish(x):
 class TestSwish(TestActivation):
     def setUp(self):
         self.op_type = "swish"
+        self.prim_op_type = "comp"
         self.python_api = paddle.nn.functional.swish
+        self.public_python_api = paddle.nn.functional.swish
         self.init_dtype()
         self.init_shape()

@@ -5244,7 +5246,11 @@ def test_check_grad(self):
         if self.dtype == np.float16:
             return
         self.check_grad(
-            ['X'], 'Out', check_pir=True, check_pir_onednn=self.check_pir_onednn
+            ['X'],
+            'Out',
+            check_pir=True,
+            check_pir_onednn=self.check_pir_onednn,
+            check_prim_pir=True,
         )

diff --git a/test/legacy_test/test_fmax_op.py b/test/legacy_test/test_fmax_op.py
index 24dc5883d52b7..14e72aa43b83a 100644
--- a/test/legacy_test/test_fmax_op.py
+++ b/test/legacy_test/test_fmax_op.py
@@ -133,7 +133,9 @@ class TestElementwiseFmaxOp(OpTest):
     def setUp(self):
         """setUp"""
         self.op_type = "elementwise_fmax"
+        self.prim_op_type = "prim"
         self.python_api = paddle.fmax
+        self.public_python_api = paddle.fmax
         # If x and y have the same
value, the max() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. @@ -149,7 +151,7 @@ def test_check_output(self): def test_check_grad_normal(self): """test_check_grad_normal""" - self.check_grad(['X', 'Y'], 'Out', check_pir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True) def test_check_grad_ignore_x(self): """test_check_grad_ignore_x""" @@ -178,7 +180,9 @@ class TestElementwiseFmax2Op(OpTest): def setUp(self): """setUp""" self.op_type = "elementwise_fmax" + self.prim_op_type = "prim" self.python_api = paddle.fmax + self.public_python_api = paddle.fmax # If x and y have the same value, the max() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. @@ -196,7 +200,7 @@ def test_check_output(self): def test_check_grad_normal(self): """test_check_grad_normal""" - self.check_grad(['X', 'Y'], 'Out', check_pir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True) def test_check_grad_ignore_x(self): """test_check_grad_ignore_x""" @@ -225,7 +229,9 @@ class TestElementwiseFmax3Op(OpTest): def setUp(self): """setUp""" self.op_type = "elementwise_fmax" + self.prim_op_type = "prim" self.python_api = paddle.fmax + self.public_python_api = paddle.fmax # If x and y have the same value, the max() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. @@ -242,7 +248,7 @@ def test_check_output(self): def test_check_grad_normal(self): """test_check_grad_normal""" - self.check_grad(['X', 'Y'], 'Out', check_pir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True) @unittest.skipIf( @@ -253,7 +259,9 @@ def test_check_grad_normal(self): class TestFmaxBF16OP(OpTest): def setUp(self): self.op_type = "elementwise_fmax" + self.prim_op_type = "prim" self.python_api = paddle.fmax + self.public_python_api = paddle.fmax self.dtype = np.uint16 x = np.random.uniform(0.1, 1, [13, 17]).astype("float32") sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") @@ -271,7 +279,9 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_pir=True) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True + ) if __name__ == "__main__": diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py index 7c7799f6452cf..9c3d0030a53d8 100644 --- a/test/legacy_test/test_fmin_op.py +++ b/test/legacy_test/test_fmin_op.py @@ -135,7 +135,9 @@ class TestElementwiseFminOp(OpTest): def setUp(self): """setUp""" self.op_type = "elementwise_fmin" + self.prim_op_type = "prim" self.python_api = paddle.fmin + self.public_python_api = paddle.fmin # If x and y have the same value, the min() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. 
@@ -151,7 +153,7 @@ def test_check_output(self): def test_check_grad_normal(self): """test_check_grad_normal""" - self.check_grad(['X', 'Y'], 'Out', check_pir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True) def test_check_grad_ignore_x(self): """test_check_grad_ignore_x""" @@ -180,7 +182,9 @@ class TestElementwiseFmin2Op(OpTest): def setUp(self): """setUp""" self.op_type = "elementwise_fmin" + self.prim_op_type = "prim" self.python_api = paddle.fmin + self.public_python_api = paddle.fmin # If x and y have the same value, the min() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. @@ -198,7 +202,7 @@ def test_check_output(self): def test_check_grad_normal(self): """test_check_grad_normal""" - self.check_grad(['X', 'Y'], 'Out', check_pir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True) def test_check_grad_ignore_x(self): """test_check_grad_ignore_x""" @@ -227,7 +231,9 @@ class TestElementwiseFmin3Op(OpTest): def setUp(self): """setUp""" self.op_type = "elementwise_fmin" + self.prim_op_type = "prim" self.python_api = paddle.fmin + self.public_python_api = paddle.fmin # If x and y have the same value, the min() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. @@ -244,7 +250,7 @@ def test_check_output(self): def test_check_grad_normal(self): """test_check_grad_normal""" - self.check_grad(['X', 'Y'], 'Out', check_pir=True) + self.check_grad(['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True) @unittest.skipIf( @@ -255,7 +261,9 @@ def test_check_grad_normal(self): class TestFminBF16OP(OpTest): def setUp(self): self.op_type = "elementwise_fmin" + self.prim_op_type = "prim" self.python_api = paddle.fmin + self.public_python_api = paddle.fmin self.dtype = np.uint16 x = np.random.uniform(1, 1, [13, 17]).astype("float32") sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") @@ -273,7 +281,9 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_pir=True) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True + ) if __name__ == "__main__": diff --git a/test/legacy_test/test_numel_op.py b/test/legacy_test/test_numel_op.py index 0f080089ab08c..103fdb765fe67 100644 --- a/test/legacy_test/test_numel_op.py +++ b/test/legacy_test/test_numel_op.py @@ -24,7 +24,9 @@ class TestNumelOp(OpTest): def setUp(self): self.op_type = "size" + self.prim_op_type = "comp" self.python_api = paddle.numel + self.public_python_api = paddle.numel self.init() x = np.random.random(self.shape).astype(self.dtype) self.inputs = { @@ -33,7 +35,7 @@ def setUp(self): self.outputs = {'Out': np.array(np.size(x))} def test_check_output(self): - self.check_output(check_pir=True) + self.check_output(check_pir=True, check_prim_pir=True) def init(self): self.shape = (6, 56, 8, 55) @@ -85,7 +87,9 @@ def init(self): class TestNumelOpComplex(TestNumelOp): def setUp(self): self.op_type = "size" + self.prim_op_type = "comp" self.python_api = paddle.numel + self.public_python_api = paddle.numel self.init() x = np.random.random(self.shape).astype( self.dtype @@ -138,7 +142,9 @@ def init(self): class TestNumelOpBF16(OpTest): def setUp(self): self.op_type = "size" + self.prim_op_type = "comp" self.python_api = paddle.numel + self.public_python_api = paddle.numel self.dtype = np.uint16 self.init() x = 
np.random.random(self.shape).astype(np.float32) @@ -147,7 +153,7 @@ def setUp(self): def test_check_output(self): place = paddle.CUDAPlace(0) - self.check_output_with_place(place, check_pir=True) + self.check_output_with_place(place, check_pir=True, check_prim_pir=True) def init(self): self.shape = (6, 56, 8, 55) diff --git a/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py index 7319217a16e9c..3e91bdbeea409 100644 --- a/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py +++ b/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py @@ -334,6 +334,14 @@ def unstack_net6(x): return paddle.unstack(x, axis=-1) +def numel_net(x): + return paddle.numel(x) + + +def swish_net(x): + return paddle.nn.functional.swish(x) + + def one_hot_net(x): return paddle.nn.functional.one_hot(x, 10) @@ -1668,6 +1676,34 @@ def setUp(self): self.tol = 1e-6 +class TestPrimNumel(TestPrimBase): + def setUp(self): + np.random.seed(2024) + paddle.seed(2024) + self.shape_x = [5, 10, 15] + self.dtype_x = "float32" + self.init_x_shape = [None, None, None] + self.x = np.random.random(self.shape_x).astype(self.dtype_x) + self.net = numel_net + self.necessary_ops = "pd_op.numel" + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimSwish(TestPrimBase): + def setUp(self): + np.random.seed(2024) + paddle.seed(2024) + self.shape_x = [2, 300, 2048] + self.dtype_x = "float32" + self.init_x_shape = [None, None, None] + self.x = np.random.random(self.shape_x).astype(self.dtype_x) + self.net = swish_net + self.necessary_ops = "pd_op.swish" + self.enable_cinn = False + self.tol = 1e-6 + + class TestPrimOneHot(TestPrimBase): def setUp(self): np.random.seed(2024) diff --git a/test/prim/pir_prim/test_prim_sub_graph_fghij_backward_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_fghij_backward_dynamic_shape.py index 3b056c1c45e72..1fadab7248e18 100644 --- a/test/prim/pir_prim/test_prim_sub_graph_fghij_backward_dynamic_shape.py +++ b/test/prim/pir_prim/test_prim_sub_graph_fghij_backward_dynamic_shape.py @@ -30,6 +30,14 @@ def floor_net(x): return paddle.floor(x) +def fmax_net(x, y): + return paddle.fmax(x, y) + + +def fmin_net(x, y): + return paddle.fmin(x, y) + + def gather_net(x, y): return paddle.gather(x, y, 1) @@ -63,6 +71,146 @@ def setUp(self): self.tol = 1e-6 +class TestPrimFmaxWithGrad1(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmax_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.y[2, 10, 5:] = np.nan + self.net = fmax_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFmaxWithGrad2(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmax_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [30, 200, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.x[3, 100, 20:] = np.nan + self.net = fmax_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFmaxWithGrad3(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmax_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 
40] + self.init_x_shape = [30, 200, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.y[2, 10, 5:] = np.nan + self.net = fmax_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFmaxWithGrad4(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmax_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [30, 200, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.x[2, 10, 5:] = np.nan + self.y[2, 10, 5:] = np.nan + self.x[10, 9, :] = self.y[10, 9, :] + self.net = fmax_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFminWithGrad1(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmin_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.y[2, 10, 5:] = np.nan + self.net = fmin_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFminWithGrad2(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmin_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [30, 200, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.x[3, 100, 20:] = np.nan + self.net = fmin_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFminWithGrad3(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmin_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [30, 200, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.y[2, 10, 5:] = np.nan + self.net = fmin_net + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimFminWithGrad4(TestPrimTwoWithGrad): + def setUp(self): + np.random.seed(2023) + self.op_name = "pd_op.fmin_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [30, 200, 40] + self.y_shape = [30, 200, 40] + self.init_y_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.x[2, 10, 5:] = np.nan + self.y[2, 10, 5:] = np.nan + self.x[10, 9, :] = self.y[10, 9, :] + self.net = fmin_net + self.enable_cinn = False + self.tol = 1e-6 + + class TestPrimGatherWithGrad1(TestPrimTwoWithGrad): def setUp(self): np.random.seed(2024) diff --git a/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py index 4a923f7383712..fb56382c99b6f 100644 --- a/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py +++ b/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py @@ -162,6 +162,10 @@ def swiglu_net2(x): return 
paddle.incubate.nn.functional.swiglu(x)


+def swish_net(x):
+    return paddle.nn.functional.swish(x)
+
+
 def tanh_net(x):
     return paddle.tanh(x)

@@ -1022,6 +1026,19 @@ def setUp(self):
         self.tol = 1e-6


+class TestPrimSwishWithGrad(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.op_name = "pd_op.swish_grad"
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, None]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = swish_net
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
 class TestPrimTanhWithGrad(TestPrimBaseWithGrad):
     def setUp(self):
         np.random.seed(2024)

From 40224de5211b16c9ed73122ffc85789c84811c82 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Thu, 10 Oct 2024 10:16:18 +0800
Subject: [PATCH 050/135] add new headers to whl,test=document_fix (#68591)

---
 python/setup.py.in |  9 ++++++++-
 setup.py           | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 4172192081b83..cef2c5a54cd6d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1133,7 +1133,14 @@ headers = (
    list(find_files('op_yaml_info_util.h','@PADDLE_SOURCE_DIR@/paddle/fluid/pir/dialect/operator/utils/'))+
    list(find_files('op_yaml_info_parser.h','@PADDLE_SOURCE_DIR@/paddle/fluid/pir/dialect/operator/utils/'))+
    list(find_files('utils.h','@PADDLE_SOURCE_DIR@/paddle/fluid/pir/dialect/operator/utils/'))+
-   list(find_files('op_compat_info.h','@PADDLE_SOURCE_DIR@/paddle/fluid/ir_adaptor/translator/')))
+   list(find_files('op_compat_info.h','@PADDLE_SOURCE_DIR@/paddle/fluid/ir_adaptor/translator/'))+
+   list(find_files('op_yaml_info_parser.h','@PADDLE_SOURCE_DIR@/paddle/fluid/pir/dialect/operator/utils/'))+
+   list(find_files('vjp.h','@PADDLE_SOURCE_DIR@/paddle/fluid/pir/dialect/operator/interface/'))+
+   list(find_files('infer_symbolic_shape.h','@PADDLE_SOURCE_DIR@/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/'))+
+   #pir headers
+   list(find_files('lexer.h','@PADDLE_SOURCE_DIR@/paddle/pir/src/core/parser/'))+
+   list(find_files('token.h','@PADDLE_SOURCE_DIR@/paddle/pir/src/core/parser/')))
+
 jit_layer_headers = ['layer.h', 'serializer.h', 'serializer_utils.h', 'all.h', 'function.h']
 for f in jit_layer_headers:
     headers += list(find_files(f, '@PADDLE_SOURCE_DIR@/paddle/fluid/jit', recursive=True))
diff --git a/setup.py b/setup.py
index d820752599b08..232c351f76d40 100644
--- a/setup.py
+++ b/setup.py
@@ -1760,6 +1760,32 @@ def get_headers():
             paddle_source_dir + '/paddle/fluid/ir_adaptor/translator/',
         )
     )
+    + list(
+        find_files(
+            'infer_symbolic_shape.h',
+            paddle_source_dir
+            + '/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape',
+        )
+    )
+    + list(
+        find_files(
+            'vjp.h',
+            paddle_source_dir
+            + '/paddle/fluid/pir/dialect/operator/interface',
+        )
+    )
+    + list(
+        find_files(
+            'lexer.h',
+            paddle_source_dir + '/paddle/pir/src/core/parser',
+        )
+    )
+    + list(
+        find_files(
+            'token.h',
+            paddle_source_dir + '/paddle/pir/src/core/parser',
+        )
+    )
 )

 jit_layer_headers = [
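The find_files helper used by get_headers() is defined elsewhere in setup.py; a rough sketch of its assumed behaviour (a glob under a root directory, optionally recursive), which is all these hunks rely on to pull the pir parser headers into the wheel:

    import glob
    import os

    def find_files(pattern, root, recursive=False):
        # Assumed behaviour: glob `pattern` under `root`; with
        # recursive=True, subdirectories are searched as well.
        expr = (os.path.join(root, "**", pattern) if recursive
                else os.path.join(root, pattern))
        return [f for f in glob.glob(expr, recursive=recursive)
                if os.path.isfile(f)]

    # e.g. the two parser headers newly shipped with the wheel:
    headers = (find_files("lexer.h", "paddle/pir/src/core/parser")
               + find_files("token.h", "paddle/pir/src/core/parser"))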
From bda65dad6e8ff2af24002f92d7323d2978f2db69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com>
Date: Thu, 10 Oct 2024 10:22:37 +0800
Subject: =?UTF-8?q?[jit.inference]=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E5=88=86=E5=B8=83=E5=BC=8F=E6=8E=A8=E7=90=86=20(#68307)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[jit.inference] Support distributed inference (#68307)
---
 python/paddle/incubate/jit/inference_decorator.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/paddle/incubate/jit/inference_decorator.py b/python/paddle/incubate/jit/inference_decorator.py
index 8df8b927d07c6..492c2c8a0e045 100644
--- a/python/paddle/incubate/jit/inference_decorator.py
+++ b/python/paddle/incubate/jit/inference_decorator.py
@@ -139,6 +139,15 @@ def __init__(self, func, used_as_at_decorator, **kwargs):
             )
         self.save_model_dir = os.path.join(self.save_model_dir, func.__name__)

+        import paddle.distributed as dist
+
+        n_ranks = dist.get_world_size()
+        if n_ranks > 1:
+            local_rank: int = dist.ParallelEnv().dev_id
+            self.save_model_dir = os.path.join(
+                self.save_model_dir, f"{n_ranks}_{local_rank}"
+            )
+
         self.precision_mode = kwargs.get("precision_mode")
         self.switch_ir_optim = kwargs.get("switch_ir_optim")
         self.switch_ir_debug = kwargs.get("switch_ir_debug")
@@ -164,6 +173,7 @@ def __init__(self, func, used_as_at_decorator, **kwargs):
         py_script = py_script[py_script.find("def") :]
         if used_as_at_decorator:
             assert self.arg_names[0] == "self"
+
         self.save_path = os.path.join(self.save_model_dir, "infer")
         d2s_input_info_path = self.save_path + "_d2s_input_info.txt"
         d2s_input_shapes = []
@@ -310,6 +320,7 @@ def forward(self, args):
             input_spec=input_specs,
             full_graph=True,
         )
+
         paddle.jit.save(model, self.save_path, skip_prune_program=True)

         # save d2s_shapes

From 4f4fdac434c8c17b27852587699699f883048441 Mon Sep 17 00:00:00 2001
From: Wennie396 <44974020+Wennie396@users.noreply.github.com>
Date: Thu, 10 Oct 2024 11:54:48 +0800
Subject: [PATCH 052/135] add spmd rules for p_norm (#68317)

* add spmd rules for p_norm
* add test for p_norm spmd rule
* fix bugs
* remove ut for p_norm spmd rule

---
 paddle/phi/infermeta/spmd_rules/p_norm.cc     |  88 +++++
 paddle/phi/infermeta/spmd_rules/p_norm.h      |  50 +++
 paddle/phi/infermeta/spmd_rules/rules.cc      |   6 +
 paddle/phi/infermeta/spmd_rules/rules.h       |   1 +
 paddle/phi/ops/yaml/backward.yaml             |   1 +
 paddle/phi/ops/yaml/ops.yaml                  |   1 +
 .../spmd_rules/test_p_norm_rule.py            | 333 ++++++++++++++++++
 7 files changed, 480 insertions(+)
 create mode 100644 paddle/phi/infermeta/spmd_rules/p_norm.cc
 create mode 100644 paddle/phi/infermeta/spmd_rules/p_norm.h
 create mode 100644 test/auto_parallel/spmd_rules/test_p_norm_rule.py

diff --git a/paddle/phi/infermeta/spmd_rules/p_norm.cc b/paddle/phi/infermeta/spmd_rules/p_norm.cc
new file mode 100644
index 0000000000000..d89771ee8e563
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/p_norm.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/infermeta/spmd_rules/p_norm.h" +#include "glog/logging.h" + +namespace phi { +namespace distributed { + +SpmdInfo PNormInferSpmd(const DistMetaTensor& x, + float porder, + int axis, + float epsilon, + bool keepdims, + bool asvector) { + std::vector new_axis; + if (asvector) { + auto x_shape = common::vectorize(x.dims()); + int x_ndim = static_cast(x_shape.size()); + new_axis.resize(x_ndim); + for (int i = 0; i < x_ndim; ++i) { + new_axis[i] = i; + } + } else { + new_axis.push_back(axis); + } + VLOG(4) << "PNormInferSpmd Call ReductionInferSpmd"; + return ReductionInferSpmd(x, new_axis, keepdims); +} + +SpmdInfo PNormInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + float porder, + int axis, + float epsilon, + bool keepdims, + bool asvector) { + std::vector new_axis; + if (asvector) { + auto x_shape = common::vectorize(x.dims()); + int x_ndim = static_cast(x_shape.size()); + new_axis.resize(x_ndim); + for (int i = 0; i < x_ndim; ++i) { + new_axis[i] = i; + } + } else { + new_axis.push_back(axis); + } + VLOG(4) << "PNormInferSpmdReverse Call ReductionInferSpmdReverse"; + return ReductionInferSpmdReverse(x, out, new_axis, keepdims); +} + +SpmdInfo PNormGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out, + const DistMetaTensor& out_grad, + float porder, + int axis, + float epsilon, + bool keepdims, + bool asvector) { + std::vector new_axis; + if (asvector) { + auto x_shape = common::vectorize(x.dims()); + int x_ndim = static_cast(x_shape.size()); + new_axis.resize(x_ndim); + for (int i = 0; i < x_ndim; ++i) { + new_axis[i] = i; + } + } else { + new_axis.push_back(axis); + } + VLOG(4) << "PNormGradInferSpmd Call ReductionGradInferSpmd"; + return ReductionGradInferSpmd(x, out, out_grad, new_axis, keepdims, asvector); +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/p_norm.h b/paddle/phi/infermeta/spmd_rules/p_norm.h new file mode 100644 index 0000000000000..caeba5c6a9dc0 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/p_norm.h @@ -0,0 +1,50 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" +#include "paddle/phi/infermeta/spmd_rules/reduction.h" + +namespace phi { +namespace distributed { + +SpmdInfo PNormInferSpmd(const DistMetaTensor& x, + float porder, + int axis, + float epsilon, + bool keepdims, + bool asvector); + +SpmdInfo PNormInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& out, + float porder, + int axis, + float epsilon, + bool keepdims, + bool asvector); + +SpmdInfo PNormGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out, + const DistMetaTensor& out_grad, + float porder, + int axis, + float epsilon, + bool keepdims, + bool asvector); +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.cc b/paddle/phi/infermeta/spmd_rules/rules.cc index 754e27a358112..99deb1728aa0f 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.cc +++ b/paddle/phi/infermeta/spmd_rules/rules.cc @@ -699,11 +699,17 @@ PD_REGISTER_SPMD_RULE(unbind, PD_INFER_SPMD(phi::distributed::UnbindInferSpmd), PD_INFER_SPMD(phi::distributed::UnbindInferSpmdReverse)); +// logsumexp PD_REGISTER_SPMD_RULE( logsumexp, PD_INFER_SPMD(phi::distributed::LogSumExpInferSpmd), PD_INFER_SPMD(phi::distributed::LogSumExpInferSpmdReverse)); +// p_norm +PD_REGISTER_SPMD_RULE(p_norm, + PD_INFER_SPMD(phi::distributed::PNormInferSpmd), + PD_INFER_SPMD(phi::distributed::PNormInferSpmdReverse)); + // pad PD_REGISTER_SPMD_RULE(pad, PD_INFER_SPMD(phi::distributed::PadInferSpmd), diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index d0cbb90148fd5..697019f755fcb 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -43,6 +43,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/numel.h" #include "paddle/phi/infermeta/spmd_rules/one_hot.h" #include "paddle/phi/infermeta/spmd_rules/optimizer.h" +#include "paddle/phi/infermeta/spmd_rules/p_norm.h" #include "paddle/phi/infermeta/spmd_rules/pad.h" #include "paddle/phi/infermeta/spmd_rules/pow.h" #include "paddle/phi/infermeta/spmd_rules/reduction.h" diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index f566aa15d0913..42d06f5f15d52 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -2183,6 +2183,7 @@ infer_meta : func : GeneralUnaryGradInferMeta param: [x] + spmd_rule : PNormGradInferSpmd kernel : func : p_norm_grad diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 916736670c39b..3139f76f10078 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -3558,6 +3558,7 @@ output : Tensor(out) infer_meta : func : PNormInferMeta + spmd_rule: PNormInferSpmd kernel : func : p_norm backward : p_norm_grad diff --git a/test/auto_parallel/spmd_rules/test_p_norm_rule.py b/test/auto_parallel/spmd_rules/test_p_norm_rule.py new file mode 100644 index 0000000000000..ee7b79dd2796a --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_p_norm_rule.py @@ -0,0 +1,333 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from collections import OrderedDict + +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto +from paddle.framework import core + + +class TestPNormSPMDRule(unittest.TestCase): + """ + Unit tests for p_norm spmd rule. + """ + + def config(self): + self.kernel = "p_norm" + + def setUp(self): + self.config() + self.rule = core.get_phi_spmd_rule(self.kernel) + + x_shape = [64, 32] + process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) + + self.attrs = OrderedDict( + [ + ('porder', 2.0), + ('axis', 0), + ('epsilon', 1.0e-12), + ('keepdims', False), + ('asvector', False), + ] + ) + + def test_infer_forward(self): + # reduce on dim 0, keepdims = false, asvector = false + # [0, -1] --> [0, -1], [-1], partial_on_dim:[0] + self.attrs['axis'] = 0 + self.attrs['keepdims'] = False + self.attrs['asvector'] = False + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), True) + self.assertEqual(infered_output_dist_attrs[0]._partial_dims(), {0}) + + # reduce on dim 0, keepdims = true, asvector = false + # [0, -1] --> [0, -1], [-1, -1], partial_on_dim:[0] + + self.attrs['keepdims'] = True + self.attrs['axis'] = 0 + self.attrs['asvector'] = False + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), True) + self.assertEqual(infered_output_dist_attrs[0]._partial_dims(), {0}) + + # reduce on dim 1, keepdims = false, asvector = false + # [0, -1] --> [0, -1], [0], partial_on_dim:[] + self.attrs['keepdims'] = False + self.attrs['axis'] = 1 + self.attrs['asvector'] = False + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + 
self.x_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), False) + + # reduce on dim 1, keepdims = true, asvector = false + # [0, -1] --> [0, -1], [0, -1], partial_on_dim:[] + self.attrs['keepdims'] = True + self.attrs['axis'] = 1 + self.attrs['asvector'] = False + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), False) + + # reduce on dim 0 and 1, keepdims = false, asvector = true + # [0, -1] --> [0, -1], [], partial_on_dim:[0] + self.attrs['keepdims'] = False + self.attrs['axis'] = 0 + self.attrs['asvector'] = True + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, []) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), True) + self.assertEqual(infered_output_dist_attrs[0]._partial_dims(), {0}) + + # reduce on dim 0 and 1, keepdims = true, asvector = true + # [0, -1] --> [0, -1], [-1, -1], partial_on_dim:[0] + self.attrs['keepdims'] = True + self.attrs['axis'] = 0 + self.attrs['asvector'] = True + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), True) + self.assertEqual(infered_output_dist_attrs[0]._partial_dims(), {0}) + + def test_infer_backward(self): + # reduce on dim 0, keepdims = false, asvector = false + # [-1] --> [-1, -1], [-1] (output --> input, output) + self.attrs['keepdims'] = False + self.attrs['axis'] = 0 + self.attrs['asvector'] = False + self.out_dist_tensor_spec.shape = [32] + self.out_dist_tensor_spec.set_dims_mapping([-1]) + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = 
result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1]) + + # reduce on dim 0, keepdims = true, asvector = false + # [-1, -1] --> [-1, -1], [-1, -1] (output --> input, output) + self.attrs['keepdims'] = True + self.attrs['axis'] = 0 + self.attrs['asvector'] = False + self.out_dist_tensor_spec.shape = [1, 32] + self.out_dist_tensor_spec.set_dims_mapping([-1, -1]) + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + + # reduce on dim 1, keepdims = false, asvector = false + # [0] --> [0, -1], [0] (output --> input, output) + self.attrs['keepdims'] = False + self.attrs['axis'] = 1 + self.attrs['asvector'] = False + self.out_dist_tensor_spec.shape = [64] + self.out_dist_tensor_spec.set_dims_mapping([0]) + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + + # reduce on dim 1, keepdims = true, asvector = false + # [0, -1] --> [0, -1], [0, -1] (output --> input, output) + self.attrs['keepdims'] = True + self.attrs['axis'] = 1 + self.attrs['asvector'] = False + self.out_dist_tensor_spec.shape = [64, 1] + self.out_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1]) + + # reduce on dim 0 and 1, keepdims = false, asvector = true + # [] --> [-1, -1], [] (output --> input, output) + self.attrs['keepdims'] = False + self.attrs['axis'] = 0 + self.attrs['asvector'] = True + self.out_dist_tensor_spec.shape = [] + self.out_dist_tensor_spec.set_dims_mapping([]) + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, []) + + # reduce on dim 0 and 1, keepdims = true, asvector = true + # [-1, -1] --> [-1, -1], [-1, -1] (output --> input, output) + 
self.attrs['keepdims'] = True + self.attrs['axis'] = 0 + self.attrs['asvector'] = True + self.out_dist_tensor_spec.shape = [1, 1] + self.out_dist_tensor_spec.set_dims_mapping([-1, -1]) + result_dist_attrs = self.rule.infer_backward( + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['porder'], + self.attrs['axis'], + self.attrs['epsilon'], + self.attrs['keepdims'], + self.attrs['asvector'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + + +if __name__ == "__main__": + unittest.main() From b33446723996db5e53628afff6bad5ef8677c9ed Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 10 Oct 2024 12:02:01 +0800 Subject: [PATCH 053/135] Migrate framework.proto to PHI [fluid_ops] (#68479) * Fix * ci --- cmake/inference_lib.cmake | 2 +- paddle/fluid/framework/CMakeLists.txt | 9 - paddle/fluid/framework/attribute.h | 2 +- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_type.h | 2 +- paddle/fluid/framework/device_worker.h | 2 +- paddle/fluid/framework/executor.cc | 2 +- .../fluid/framework/heter_pipeline_trainer.cc | 2 +- paddle/fluid/framework/heter_service.h | 2 +- paddle/fluid/framework/infershape_utils.cc | 2 +- paddle/fluid/framework/ir/generate_pass.h | 2 +- .../interpreter/interpreter_util.cc | 2 +- paddle/fluid/framework/op_def.proto | 2 +- paddle/fluid/framework/op_version_proto.h | 2 +- paddle/fluid/framework/phi_utils.h | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 2 +- paddle/fluid/framework/prune.h | 2 +- paddle/fluid/framework/trainer.h | 2 +- paddle/fluid/framework/var_desc.h | 2 +- paddle/fluid/imperative/layout_transformer.h | 2 +- .../ir_params_sync_among_devices_pass.cc | 2 +- paddle/fluid/inference/tensorrt/helper.h | 2 +- .../tensorrt/plugin/custom_generic_plugin.cu | 2 +- .../tensorrt/plugin/generic_plugin.cu | 2 +- paddle/fluid/inference/utils/model_utils.cc | 2 +- .../ir_adaptor/translator/type_translator.cc | 2 +- .../pir/transforms/pd_op_to_kernel_pass.cc | 2 +- paddle/fluid/pybind/box_helper_py.cc | 2 +- paddle/fluid/pybind/data_set_py.cc | 2 +- paddle/fluid/pybind/fleet_wrapper_py.cc | 2 +- paddle/fluid/pybind/nccl_wrapper_py.cc | 2 +- paddle/phi/CMakeLists.txt | 9 + paddle/phi/core/CMakeLists.txt | 1 + paddle/phi/core/framework/CMakeLists.txt | 6 + paddle/phi/core/framework/data_feed.proto | 67 +++++ paddle/phi/core/framework/framework.proto | 270 ++++++++++++++++++ paddle/phi/core/framework/heter_service.proto | 70 +++++ paddle/phi/core/framework/pass_desc.proto | 91 ++++++ paddle/phi/core/framework/trainer_desc.proto | 220 ++++++++++++++ test/cpp/pir/core/add_dialect_parser_test.cc | 2 +- test/cpp/pir/core/ir_parser_test.cc | 2 +- test/cpp/pir/core/program_translator_test.cc | 2 +- .../cpp/pir/pass/transfer_layout_pass_test.cc | 2 +- 43 files changed, 768 insertions(+), 43 deletions(-) create mode 100644 paddle/phi/core/framework/CMakeLists.txt create mode 100644 paddle/phi/core/framework/data_feed.proto create mode 100644 paddle/phi/core/framework/framework.proto create mode 100644 paddle/phi/core/framework/heter_service.proto create mode 100644 paddle/phi/core/framework/pass_desc.proto create mode 100644 paddle/phi/core/framework/trainer_desc.proto diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 79937fc1aab02..247a7e5c87777 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ 
-304,7 +304,7 @@ endif() copy( inference_lib_dist - SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h + SRCS ${CMAKE_BINARY_DIR}/paddle/phi/core/framework/framework.pb.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal) copy( inference_lib_dist diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 4a63fb939ddf8..bf0c45ad4bf1c 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -82,10 +82,6 @@ add_subdirectory(fleet) add_subdirectory(io) add_subdirectory(new_executor) -#ddim lib -proto_library(framework_proto SRCS framework.proto) -proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto) - proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto) cc_library( op_def_api @@ -105,11 +101,6 @@ foreach(OP_DEF_FILE ${OP_DEF_FILES}) endforeach() file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") -proto_library(heter_service_proto SRCS heter_service.proto) -proto_library(data_feed_proto SRCS data_feed.proto) -proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto - data_feed_proto) - cc_library( string_array SRCS string_array.cc diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index cf0ba09e7eb8e..b6dd0aa21aa56 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -24,10 +24,10 @@ limitations under the License. */ #include #include "paddle/common/errors.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/utils/any.h" #include "paddle/utils/variant.h" diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 5b6f40b6f794a..829a13a22aaf2 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -35,11 +35,11 @@ limitations under the License. */ #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/phi/core/framework/data_feed.pb.h" #include "paddle/phi/core/platform/timer.h" #include "paddle/utils/string/string_helper.h" #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index cb03e53c9787a..a40f33e2f3fbf 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" @@ -25,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/float16.h" #include "paddle/phi/common/float8_e4m3fn.h" #include "paddle/phi/common/float8_e5m2.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/utils/test_macros.h" namespace paddle { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 3d1dc9c08953c..5ffd6d3b10315 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -39,11 +39,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/port.h" +#include "paddle/phi/core/framework/trainer_desc.pb.h" #include "paddle/phi/core/platform/timer.h" namespace paddle { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index eaf8545e56183..3977a3abc353b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -17,12 +17,12 @@ limitations under the License. */ #include #include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/trainer_factory.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/framework/trainer_desc.pb.h" #include "paddle/phi/core/platform/profiler.h" #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/platform/onednn_helper.h" diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index 904938e5b8e99..3aa3b5a3d1ad5 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" -#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/phi/core/framework/trainer_desc.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 12a89102e4e04..8a121c3d15c48 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -23,9 +23,9 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/framework/heter_service.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/phi/core/framework/heter_service.pb.h" #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) #include "brpc/channel.h" #include "brpc/controller.h" diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index b873970f382d2..0490435d955b7 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/int_array.h" @@ -27,6 +26,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/fluid/framework/ir/generate_pass.h b/paddle/fluid/framework/ir/generate_pass.h index 9f1ff68c1850a..60a6690059e32 100644 --- a/paddle/fluid/framework/ir/generate_pass.h +++ b/paddle/fluid/framework/ir/generate_pass.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/pass_desc.pb.h" +#include "paddle/phi/core/framework/pass_desc.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 921aff862e7b7..089a8324f3c2c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -20,7 +20,6 @@ #include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/io/save_load_tensor.h" #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" @@ -36,6 +35,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" #include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/memory/stats.h" diff --git a/paddle/fluid/framework/op_def.proto b/paddle/fluid/framework/op_def.proto index 51b5e13eafdd8..1480a6f9dc846 100644 --- a/paddle/fluid/framework/op_def.proto +++ b/paddle/fluid/framework/op_def.proto @@ -14,7 +14,7 @@ limitations under the License. */ syntax = "proto2"; -import "paddle/fluid/framework/framework.proto"; +import "paddle/phi/core/framework/framework.proto"; package paddle.framework.proto; message OpDef { diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h index 0de3bc82a94d2..64786af39cc26 100644 --- a/paddle/fluid/framework/op_version_proto.h +++ b/paddle/fluid/framework/op_version_proto.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/phi/core/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 243ee303d296f..5c0c5274653c6 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include "paddle/common/macros.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/init_default_kernel_signature_map.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" @@ -29,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/common/backend.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/compat/arg_map_context.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index aab0bc0a7ac95..ce4a38fe745ec 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" -#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/phi/core/framework/trainer_desc.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h index 57f282536bf46..2e68085a6b738 100644 --- a/paddle/fluid/framework/prune.h +++ b/paddle/fluid/framework/prune.h @@ -20,9 +20,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 451508bda2177..655636acc2ece 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -31,10 +31,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/phi/common/port.h" +#include "paddle/phi/core/framework/trainer_desc.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index dfd162ac1d149..49698a2b51112 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -22,8 +22,8 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/type_defs.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/utils/test_macros.h" namespace paddle { diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 349d2f5b5eb36..3fe2fa5751d2a 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -14,11 +14,11 @@ #pragma once #include "paddle/common/errors.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 9f33a829d2821..939c0692ccb1e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -19,13 +19,13 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/framework/framework.pb.h" PD_DEFINE_bool( // NOLINT custom_model_save_cpu, diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index bff9605842949..6b8292d73d94b 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -22,10 +22,10 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/utils/data_type.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu index 59b9f7f165ab9..c6273921abb9a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu @@ -14,7 +14,6 @@ #include "paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.h" #include "paddle/common/enforce.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" @@ -22,6 +21,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu index 5200f583c2395..bf192957cf9f0 100644 --- 
a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -14,7 +14,6 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h" @@ -22,6 +21,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/kernels/funcs/data_type_transform.h" diff --git a/paddle/fluid/inference/utils/model_utils.cc b/paddle/fluid/inference/utils/model_utils.cc index c4cab12b21a25..187fdaa316026 100644 --- a/paddle/fluid/inference/utils/model_utils.cc +++ b/paddle/fluid/inference/utils/model_utils.cc @@ -14,9 +14,9 @@ #include "paddle/fluid/inference/utils/model_utils.h" #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/framework/framework.pb.h" namespace paddle::inference { diff --git a/paddle/fluid/ir_adaptor/translator/type_translator.cc b/paddle/fluid/ir_adaptor/translator/type_translator.cc index d46ce502f70d4..089d795cd1385 100644 --- a/paddle/fluid/ir_adaptor/translator/type_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/type_translator.cc @@ -14,9 +14,9 @@ #include "paddle/fluid/ir_adaptor/translator/type_translator.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/pir/include/core/builtin_type.h" namespace paddle::translator { diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index f9ffa5c4b9f0a..244dcedd9bb5a 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -53,10 +53,10 @@ #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" #ifdef PADDLE_WITH_DNNL -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h" #include "paddle/fluid/pir/dialect/operator/trait/onednn.h" +#include "paddle/phi/core/framework/framework.pb.h" COMMON_DECLARE_bool(use_mkldnn); #endif diff --git a/paddle/fluid/pybind/box_helper_py.cc b/paddle/fluid/pybind/box_helper_py.cc index 227c9dedba1f8..e49e0703ee6c6 100644 --- a/paddle/fluid/pybind/box_helper_py.cc +++ b/paddle/fluid/pybind/box_helper_py.cc @@ -26,9 +26,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/phi/core/framework/data_feed.pb.h" #ifdef PADDLE_WITH_BOX_PS #include #endif diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index c12b778dec35b..3635c9bb70d69 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -28,12 +28,12 @@ limitations under the License. 
*/ #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/dataset_factory.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/framework/data_feed.pb.h" #include "paddle/fluid/pybind/data_set_py.h" diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index e6fbc29ace403..95f50705da41a 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -27,11 +27,11 @@ limitations under the License. */ #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/framework/data_feed.pb.h" #include "paddle/fluid/pybind/fleet_wrapper_py.h" diff --git a/paddle/fluid/pybind/nccl_wrapper_py.cc b/paddle/fluid/pybind/nccl_wrapper_py.cc index e5925ceaf19af..efc9704aea174 100644 --- a/paddle/fluid/pybind/nccl_wrapper_py.cc +++ b/paddle/fluid/pybind/nccl_wrapper_py.cc @@ -27,11 +27,11 @@ limitations under the License. */ #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/fleet/nccl_wrapper.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/framework/data_feed.pb.h" #include "paddle/fluid/pybind/nccl_wrapper_py.h" diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 4a15534ea7921..087b38d336b53 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -52,6 +52,15 @@ set(PHI_DEPS utf8proc common) +list( + APPEND + PHI_DEPS + framework_proto + pass_desc_proto + data_feed_proto + trainer_desc_proto + heter_service_proto) + set(INFERENCE_DEPS phi_profiler_proto auto_parallel_proto) if(WITH_GPU) diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index bfb34cc6dfd9e..5e8ff5d5fc2ef 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(compat) add_subdirectory(distributed) add_subdirectory(memory) add_subdirectory(platform) +add_subdirectory(framework) if(WITH_GPU) proto_library(external_error_proto SRCS external_error.proto) diff --git a/paddle/phi/core/framework/CMakeLists.txt b/paddle/phi/core/framework/CMakeLists.txt new file mode 100644 index 0000000000000..dc025f19903c2 --- /dev/null +++ b/paddle/phi/core/framework/CMakeLists.txt @@ -0,0 +1,6 @@ +proto_library(framework_proto SRCS framework.proto) +proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto) +proto_library(data_feed_proto SRCS data_feed.proto) +proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto + data_feed_proto) +proto_library(heter_service_proto SRCS heter_service.proto) diff --git a/paddle/phi/core/framework/data_feed.proto b/paddle/phi/core/framework/data_feed.proto new file mode 100644 index 
0000000000000..f11c9c20632a6 --- /dev/null +++ b/paddle/phi/core/framework/data_feed.proto @@ -0,0 +1,67 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.framework; + +message Slot { + required string name = 1; + required string type = 2; + optional bool is_dense = 3 [ default = false ]; + optional bool is_used = 4 [ default = false ]; + repeated int32 shape = 5; // we can define N-D phi::DenseTensor +} + +message MultiSlotDesc { + repeated Slot slots = 1; + optional string uid_slot = 2; +} + +message GraphConfig { + optional int32 walk_degree = 1 [ default = 1 ]; + optional int32 walk_len = 2 [ default = 20 ]; + optional int32 window = 3 [ default = 5 ]; + optional int32 once_sample_startid_len = 4 [ default = 8000 ]; + optional int32 sample_times_one_chunk = 5 [ default = 10 ]; + optional int32 batch_size = 6 [ default = 1 ]; + optional int32 debug_mode = 7 [ default = 0 ]; + optional string first_node_type = 8; + optional string meta_path = 9; + optional bool gpu_graph_training = 10 [ default = true ]; + optional bool sage_mode = 11 [ default = false ]; + optional string samples = 12; + optional int64 train_table_cap = 13 [ default = 80000 ]; + optional int64 infer_table_cap = 14 [ default = 80000 ]; + optional string excluded_train_pair = 15; + optional string infer_node_type = 16; + optional bool get_degree = 17 [ default = false ]; + optional bool weighted_sample = 18 [ default = false ]; + optional bool return_weight = 19 [ default = false ]; + optional string pair_label = 20; + optional bool is_thread_sharding = 21 [ default = false ]; + optional int32 accumulate_num = 22 [ default = 1 ]; +} + +message DataFeedDesc { + optional string name = 1; + optional int32 batch_size = 2 [ default = 32 ]; + optional MultiSlotDesc multi_slot_desc = 3; + optional string pipe_command = 4; + optional int32 thread_num = 5; + optional string rank_offset = 6; + optional int32 pv_batch_size = 7 [ default = 32 ]; + optional int32 input_type = 8 [ default = 0 ]; + optional string so_parser_name = 9; + optional GraphConfig graph_config = 10; +} diff --git a/paddle/phi/core/framework/framework.proto b/paddle/phi/core/framework/framework.proto new file mode 100644 index 0000000000000..4633cc68b70ad --- /dev/null +++ b/paddle/phi/core/framework/framework.proto @@ -0,0 +1,270 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.framework.proto; + +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serialization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; + BLOCKS = 10; + LONGS = 11; + FLOAT64S = 12; + VAR = 13; + VARS = 14; + FLOAT64 = 15; + SCALAR = 16; + SCALARS = 17; +} + + +message Complex { + required double r = 1; + required double i = 2; +}; + +message Scalar { + enum Type { + BOOLEAN = 1; + LONG = 2; + FLOAT64 = 3; + COMPLEX128 = 4; + } + required Type type = 1; + + optional bool b = 2; + optional int64 i = 3; + optional double r = 4; + optional Complex c = 5; +}; + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. +message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + repeated int32 blocks_idx = 14; + repeated int64 longs = 15; + repeated double float64s = 16; + optional string var_name = 17; + repeated string vars_name = 18; + optional double float64 = 19; + optional Scalar scalar = 20; + repeated Scalar scalars = 21; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + optional bool extra = 6 [ default = false ]; + optional bool quant = 7 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + optional bool extra = 5 [ default = false ]; + optional bool quant = 6 [ default = false ]; + optional bool support_tensor = 7 [ default = false]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + // phi::DenseTensor is used in C++. 
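+    // (phi::DenseTensor<size_t> on the C++ side.) Tag values are append-only
+    // for serialization compatibility, so the numbering below is not
+    // contiguous with the POD block above.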
+ SIZE_T = 19; + UINT8 = 20; + INT8 = 21; + BF16 = 22; + COMPLEX64 = 23; + COMPLEX128 = 24; + FP8_E4M3FN = 32; + FP8_E5M2 = 33; + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + + STRING = 25; + STRINGS = 26; + VOCAB = 27; + FEED_LIST = 28; + // The data type of phi::StringTensor + PSTRING = 29; + // the data type of phi::SparseCooTensor + SPARSE_COO = 30; + // the data type of phi::SparseCsrTensor + SPARSE_CSR = 31; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; + + optional TensorDesc string = 8; + optional TensorDesc strings = 9; + optional TensorDesc vocab = 10; + optional TensorDesc sparse_coo = 11; + optional TensorDesc sparse_csr = 12; +} + +message VarDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional string s = 4; + repeated int32 ints = 5; + }; + + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; + // True if the variable is an input data and + // have to check the feed data shape and dtype + optional bool need_check_feed = 4 [ default = false ]; + optional bool is_parameter = 5 [ default = false ]; + optional bool stop_gradient = 6 [ default = false ]; + repeated Attr attrs = 7; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// In some cases, Paddle may perform operator definition iterations, +// and the operator uses OpVersionMap for compatibility testing. +message OpVersion { required int32 version = 1; } +message OpVersionMap { + message OpVersionPair { + required string op_name = 1; + required OpVersion op_version = 2; + } + repeated OpVersionPair pair = 1; +} + +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { + reserved 2, 3; // For backward compatibility. + repeated BlockDesc blocks = 1; + optional Version version = 4; + optional OpVersionMap op_version_map = 5; +} diff --git a/paddle/phi/core/framework/heter_service.proto b/paddle/phi/core/framework/heter_service.proto new file mode 100644 index 0000000000000..184c3e6aae93c --- /dev/null +++ b/paddle/phi/core/framework/heter_service.proto @@ -0,0 +1,70 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+package paddle.framework;
+option cc_generic_services = true;
+
+// It can be: phi::DenseTensor, SelectedRows or NCCL_ID
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+  NCCL_ID = 2;
+}
+
+// VariableMessage is a serialized paddle variable message.
+// NOTICE(gongwb): don't modify this proto if you are not
+// familiar with how we serialize in sendrecvop_utils.h
+// and deserialize it in variable_response.h.
+message VariableMessage {
+  enum Type {
+    // Pod Types
+    BOOL = 0;
+    INT16 = 1;
+    INT32 = 2;
+    INT64 = 3;
+    FP16 = 4;
+    FP32 = 5;
+    FP64 = 6;
+  }
+
+  message LodData { repeated int64 lod_data = 1; }
+  optional string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  optional VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
+  optional Type data_type = 3;
+  repeated int64 dims = 4;
+
+  // lod details:
+  optional int64 lod_level = 5;
+  repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  optional int64 slr_height = 7;
+  // tensor data
+  optional bytes data = 8;
+}
+message HeterRequest {
+  required int32 cmd = 1;
+  optional int32 cur_batch = 2;
+  repeated VariableMessage vars = 3;
+};
+
+message HeterResponse {
+  // optional VariableMessage vars = 1;
+  repeated VariableMessage vars = 1;
+};
+
+service HeterService { rpc service(HeterRequest) returns (HeterResponse); };
diff --git a/paddle/phi/core/framework/pass_desc.proto b/paddle/phi/core/framework/pass_desc.proto
new file mode 100644
index 0000000000000..66dd6c02f1d90
--- /dev/null
+++ b/paddle/phi/core/framework/pass_desc.proto
@@ -0,0 +1,91 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+import "paddle/phi/core/framework/framework.proto";
+package paddle.framework.proto;
+
+// Describes one substitute subgraph.
+message PassDesc {
+  enum RoleType {
+    kVariable = 0;
+    kOperator = 1;
+  }
+  enum OperationType {
+    kAdd = 0;
+    kSub = 1;
+    kMul = 2;
+    kDiv = 3;
+    kSize = 4;
+    kMod = 5;
+  }
+  enum ConditionType {
+    kEQ = 0;
+    kNE = 1;
+    kGT = 2;
+    kGE = 3;
+    kLT = 4;
+    kLE = 5;
+  }
+  // Representation of attr in var or operator.
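+  // role selects the owner: for kVariable the attr lives on the variable named
+  // by var_name, for kOperator on the OpDesc at op_index; element_name and
+  // element_index optionally address one element of a repeated field.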
+ message Attr { + required RoleType role = 1; + optional string var_name = 2; + optional int32 op_index = 3; + required string name = 4; + optional string element_name = 5; + optional int32 element_index = 6; + optional OperationType operation = 7; + } + // The operation to be performed. + message Operation { + required OperationType type = 1; + optional Attr attr = 2; + optional OpDesc.Attr value = 3; + } + message VarMap { + required string pattern_var = 1; + required string replace_var = 2; + } + message AttrMap { + required Attr pattern_attr = 1; + required Attr replace_attr = 2; + optional Operation operation = 3; + } + message AttrCondition { + required Attr attr = 1; + required ConditionType type = 2; + optional Attr condition_attr = 3; + optional OpDesc.Attr condition_value = 4; + optional Operation operation = 5; + } + // A pair of subgraphs for matching and rewriting. + repeated OpDesc pattern = 1; + repeated OpDesc replace = 2; + // Mapping vars between pattern and replace subgraphs. + repeated VarMap var_maps = 3; + // Mapping attrs of vars and ops between pattern and replace subgraphs. + repeated AttrMap var_attr_maps = 4; + repeated AttrMap op_attr_maps = 5; + // Limit the attrs of vars and ops in pattern subgraph. + repeated AttrCondition var_attr_conditions = 6; + repeated AttrCondition op_attr_conditions = 7; +} + +// A series of PassDesc. +message MultiPassDesc { + optional string pass_type = 1; + repeated PassDesc pass_descs = 2; +} diff --git a/paddle/phi/core/framework/trainer_desc.proto b/paddle/phi/core/framework/trainer_desc.proto new file mode 100644 index 0000000000000..ee40307c523c7 --- /dev/null +++ b/paddle/phi/core/framework/trainer_desc.proto @@ -0,0 +1,220 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
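+// TrainerDesc configures one training run: the trainer and device-worker
+// classes to instantiate, thread count, dump/fetch behaviour, and the
+// DataFeedDesc used to read input data.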
+ +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; +import "paddle/phi/core/framework/data_feed.proto"; +import "paddle/phi/core/framework/framework.proto"; +package paddle.framework; + +message TrainerDesc { + // class name for create trainer desc + // the matchness of trainer name and device worker name + // will be checked in python API + optional string class_name = 1; + // class name for creating device worker + optional string device_worker_name = 2; + // thread number + optional int32 thread_num = 3; + // if we need to binding cpu + optional bool binding_cpu = 4 [ default = false ]; + repeated string filelist = 5; + optional bool debug = 6 [ default = false ]; + optional FetchConfig fetch_config = 7; + optional bool use_cvm = 8 [ default = false ]; + optional bool dump_slot = 9 [ default = false ]; + optional float scale_datanorm = 10 [ default = -1 ]; + optional int32 mpi_rank = 11 [ default = -1 ]; + optional string dump_fields_path = 12; + repeated string dump_fields = 13; + optional string dump_converter = 14; + repeated string dump_param = 15; + optional int32 mpi_size = 16 [ default = -1 ]; + optional int32 dump_file_num = 17 [ default = 16 ]; + repeated string check_nan_var_names = 18; + optional CopyTableConfig copy_table_config = 19; + // adjust ins weight + optional AdjustInsWeightConfig adjust_ins_weight_config = 20; + optional bool no_cvm = 21 [ default = false ]; + optional bool thread_barrier = 22; + repeated string loss_names = 23; + optional bool enable_random_dump = 24 [ default = false ]; + optional bool random_with_lineid = 25 [ default = false ]; + optional int32 dump_interval = 26 [ default = 10000 ]; + repeated int32 worker_places = 27; + + repeated string xpu_send_list = 28; + repeated string xpu_recv_list = 29; + optional int32 xpu_start_idx = 30; + optional int32 xpu_end_idx = 31; + + optional bool use_ps_gpu = 32 [ default = false ]; + optional string user_define_dump_filename = 33; + optional bool scale_sparse_gradient_with_batch_size = 34 [ default = true ]; + + repeated int32 trainers = 35; + optional int32 trainer_id = 36; + + // add for gpu + optional string fleet_desc = 37; + optional bool is_dump_in_simple_mode = 38 [ default = false ]; + optional string dump_fields_mode = 39 [ default = "w" ]; + optional int32 dump_num_decimals = 40 [ default = 9 ]; + optional bool use_gpu_graph = 41 [ default = false ]; + // device worker parameters + optional HogwildWorkerParameter hogwild_param = 101; + optional DownpourWorkerParameter downpour_param = 103; + optional PullDenseWorkerParameter pull_dense_param = 102; + optional SectionWorkerParameter section_param = 104; + optional HeterSectionWorkerParameter heter_section_param = 105; + // datafeed desc + optional DataFeedDesc data_desc = 201; +} + +message HogwildWorkerParameter { + repeated string skip_ops = 1; + repeated string stat_var_names = 2; +} + +message DownpourWorkerParameter { + repeated TableParameter sparse_table = 1; + repeated TableParameter dense_table = 2; + repeated string skip_ops = 3; + repeated ProgramConfig program_config = 4; + optional bool push_sparse = 5 [ default = true ]; + optional bool push_dense = 6 [ default = true ]; + repeated string stat_var_names = 7; +} + +message SectionWorkerParameter { + optional SectionConfig section_config = 1; + optional int32 queue_size = 2 [ default = 1 ]; + optional int64 sync_steps = 3 [ default = 1 ]; + optional int32 start_cpu_core_id = 4 [ default = 1 ]; + repeated string param_need_sync = 5; + optional int32 num_microbatches = 6; + 
optional int32 num_pipeline_stages = 7 [ default = 1 ]; + optional int32 pipeline_stage = 8 [ default = 1 ]; + optional int32 schedule_mode = 9 [ default = 0 ]; +} + +message HeterSectionWorkerParameter { + optional SectionConfig section_config = 1; + optional int32 queue_size = 2 [ default = 1 ]; + optional int64 sync_steps = 3 [ default = 1 ]; + optional int32 start_cpu_core_id = 4 [ default = 1 ]; + repeated string param_need_sync = 5; + optional int32 num_microbatches = 6; + optional int32 num_pipeline_stages = 7 [ default = 1 ]; + optional int32 pipeline_stage = 8 [ default = 1 ]; +} + +message SectionConfig { + enum Place { + CPUPlace = 0; + CUDAPlace = 1; + CUDAPinnedPlace = 2; + } + + // FIXME: How to use proto::ProgramDesc + // required string program_desc_str = 1; + optional proto.ProgramDesc program_desc = 1; + optional Place place = 2; + optional int32 concurrency = 3 [ default = 1 ]; + repeated string section_in_var_names = 4; + repeated string section_out_var_names = 5; + optional int32 place_id = 6 [ default = -1 ]; +} + +message FetchConfig { + enum Method { PRINT = 0; } + repeated string fetch_var_names = 1; + repeated string fetch_var_str_format = 2; + optional int32 print_period = 3 [ default = 100 ]; + optional Method method = 4 [ default = PRINT ]; +} + +message AdjustInsWeightConfig { + optional bool need_adjust = 1 [ default = false ]; + optional string nid_slot = 2 [ default = "" ]; + optional float nid_adjw_threshold = 3 [ default = 0.0 ]; + optional float nid_adjw_ratio = 4 [ default = 0.0 ]; + optional string ins_weight_slot = 5 [ default = "" ]; +} + +message TableDependencyMap { + required int32 key = 1; + repeated int32 values = 2; +} + +message CopyTableConfig { + optional bool need_copy = 1 [ default = false ]; + optional int32 batch_num = 2 [ default = 100 ]; + repeated int32 src_sparse_tables = 3; + repeated int32 dest_sparse_tables = 4; + repeated int32 src_dense_tables = 5; + repeated int32 dest_dense_tables = 6; + repeated string src_var_list = 7; + repeated string dest_var_list = 8; + // when dest dense table has no grad, should pull explicitly + optional bool dense_pull_after_copy = 9 [ default = false ]; + // copy feasigns or copy the whole table + optional bool sparse_copy_by_feasign = 10 [ default = true ]; + // table dependency for pull/push + optional bool enable_dependency = 11 [ default = false ]; + repeated TableDependencyMap table_dependency_map = 12; +} + +message CondTableMap { + required int32 key = 1; + required int32 value = 2; +} +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; + repeated CondTableMap partial_pushdense_condtable_map = 10; +} + +message PullDenseWorkerParameter { + // dense table only and specialized usage + optional int32 threshold = 1 [ default = 1 ]; + optional int32 device_num = 2; + optional int32 sleep_time_ms = 3 [ default = 2 ]; + repeated TableParameter dense_table = 4; +} + +message TableParameter { + // dense table only + optional uint64 table_id = 1; + repeated string dense_value_name = 2; + repeated string dense_grad_name = 3; + repeated int32 push_dense_wait_times = 5; + // sparse table only + repeated string sparse_key_name = 6; + repeated string sparse_value_name = 7; + repeated string sparse_grad_name = 8; + repeated int32 push_sparse_wait_times = 9; + // sparse table only and specialized usage + optional int32 emb_dim = 10; + 
optional int32 fea_dim = 11; + optional string label_var_name = 12; + // if table will pull sparse to local first + optional bool is_local = 13 [ default = false ]; + // if table will pull sparse asynchronously in worker + optional bool is_async = 14 [ default = false ]; + optional string async_wait_op_name = 15; +} diff --git a/test/cpp/pir/core/add_dialect_parser_test.cc b/test/cpp/pir/core/add_dialect_parser_test.cc index 602571743d773..257ac906c4d0f 100644 --- a/test/cpp/pir/core/add_dialect_parser_test.cc +++ b/test/cpp/pir/core/add_dialect_parser_test.cc @@ -14,10 +14,10 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/pir/include/core/attribute.h" #include "paddle/pir/include/core/attribute_base.h" #include "paddle/pir/include/core/builtin_attribute.h" diff --git a/test/cpp/pir/core/ir_parser_test.cc b/test/cpp/pir/core/ir_parser_test.cc index dbbf7d76b2766..c4df852a8e4cf 100644 --- a/test/cpp/pir/core/ir_parser_test.cc +++ b/test/cpp/pir/core/ir_parser_test.cc @@ -17,10 +17,10 @@ #include "gtest/gtest.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/pir/include/core/attribute.h" #include "paddle/pir/include/core/attribute_base.h" #include "paddle/pir/include/core/builtin_attribute.h" diff --git a/test/cpp/pir/core/program_translator_test.cc b/test/cpp/pir/core/program_translator_test.cc index 9071defaaac19..2c0a1c6cf682f 100644 --- a/test/cpp/pir/core/program_translator_test.cc +++ b/test/cpp/pir/core/program_translator_test.cc @@ -22,7 +22,6 @@ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" @@ -31,6 +30,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/pir/include/core/builtin_dialect.h" #include "paddle/pir/include/core/dialect.h" #include "paddle/pir/include/core/ir_context.h" diff --git a/test/cpp/pir/pass/transfer_layout_pass_test.cc b/test/cpp/pir/pass/transfer_layout_pass_test.cc index 26f7649d161b1..80af3d844ed7a 100644 --- a/test/cpp/pir/pass/transfer_layout_pass_test.cc +++ b/test/cpp/pir/pass/transfer_layout_pass_test.cc @@ -29,7 +29,6 @@ #include "paddle/common/layout.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" @@ -40,6 +39,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/transforms/general/transfer_layout_pass.h" #include "paddle/fluid/pir/transforms/passes.h" +#include "paddle/phi/core/framework/framework.pb.h" #include "paddle/pir/include/core/builtin_dialect.h" #include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/program.h" From 
8812d38be931dd2be63750edf9bebd648fb79bbe Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 10 Oct 2024 12:03:41 +0800 Subject: [PATCH 054/135] Clean collective_helper in cmake [fluid_ops] (#68567) * Fix * ci --- paddle/fluid/distributed/collective/CMakeLists.txt | 11 +++++------ .../fluid/distributed/fleet_executor/CMakeLists.txt | 1 - paddle/fluid/framework/CMakeLists.txt | 2 -- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- paddle/fluid/imperative/CMakeLists.txt | 13 ++++++------- paddle/fluid/operators/collective/CMakeLists.txt | 5 ++--- paddle/fluid/platform/CMakeLists.txt | 5 ----- 7 files changed, 14 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index a8cc37456e01d..2ec6f64e1100a 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -19,13 +19,12 @@ if(WITH_NCCL OR WITH_RCCL) cc_library( process_group_nccl SRCS process_group_nccl.cc common.cc - DEPS process_group phi common collective_helper device_context - ${DEVICE_EVENT_LIBS}) + DEPS process_group phi ${DEVICE_EVENT_LIBS}) cc_library( async_load SRCS async_load.cc - DEPS device_context phi common ${DEVICE_EVENT_LIBS}) + DEPS phi ${DEVICE_EVENT_LIBS}) endif() @@ -33,21 +32,21 @@ if(WITH_XPU_BKCL) cc_library( process_group_bkcl SRCS process_group_bkcl.cc bkcl_tools.cc common.cc - DEPS process_group phi common collective_helper device_context) + DEPS process_group phi) endif() if(WITH_MPI) cc_library( process_group_mpi SRCS process_group_mpi.cc mpi_tools.cc common.cc - DEPS collective_helper device_context) + DEPS phi) endif() if(WITH_CUSTOM_DEVICE) cc_library( process_group_custom SRCS process_group_custom.cc custom_ccl_tools.cc common.cc - DEPS process_group phi common collective_helper device_context) + DEPS process_group phi) endif() set(COMM_UTILS_DEPS process_group) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 27ae6cb0702fd..6d516af91b652 100755 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -38,7 +38,6 @@ cc_library( fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool - collective_helper executor_gc_helper op_registry phi diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bf0c45ad4bf1c..516b80506b40e 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -565,7 +565,6 @@ if(WITH_DISTRIBUTE) data_set.cc DEPS fleet_wrapper op_registry - device_context scope framework_proto trainer_desc_proto @@ -579,7 +578,6 @@ if(WITH_DISTRIBUTE) lod_rank_table feed_fetch_method feed_hook - collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d27564c7de1e5..74911f601b3bf 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -19,7 +19,7 @@ cc_library( cc_library( graph_helper SRCS graph_helper.cc - DEPS graph program_utils collective_helper) # + DEPS graph program_utils phi) # cc_library( pass SRCS pass.cc diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 746dbe4163d49..2758d08162b95 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ 
-93,12 +93,11 @@ if(NOT WIN32)
   cc_library(
     imperative_all_reduce
     SRCS all_reduce.cc
-    DEPS collective_helper device_context selected_rows_utils tensor)
+    DEPS phi selected_rows_utils tensor)
   cc_library(
     nccl_context
     SRCS nccl_context.cc
-    DEPS collective_helper device_context imperative_all_reduce
-         var_type_traits)
+    DEPS phi imperative_all_reduce var_type_traits)
   if(WITH_NCCL)
     nv_library(
       reducer
@@ -116,7 +115,7 @@ if(NOT WIN32)
   cc_library(
     bkcl_context
     SRCS bkcl_context.cc
-    DEPS collective_helper device_context tensor var_type_traits)
+    DEPS phi tensor var_type_traits)
   cc_library(
     reducer
     SRCS reducer.cc
@@ -126,7 +125,7 @@ if(NOT WIN32)
   cc_library(
     xccl_context
     SRCS xccl_context.cc
-    DEPS collective_helper device_context tensor var_type_traits)
+    DEPS phi tensor var_type_traits)
   if(NOT
      (WITH_NCCL
       OR WITH_RCCL
@@ -145,7 +144,7 @@ if(NOT WIN32)
   cc_library(
     heter_ccl_context
     SRCS heter_ccl_context.cc
-    DEPS collective_helper device_context tensor var_type_traits)
+    DEPS phi tensor var_type_traits)
 endif()
 cc_library(
   data_loader
@@ -156,7 +155,7 @@ if(WITH_GLOO)
   cc_library(
     imperative_gloo_context
     SRCS gloo_context.cc
-    DEPS collective_helper device_context tensor var_type_traits)
+    DEPS phi tensor var_type_traits)
   if(WIN32
      OR (NOT
          (WITH_NCCL
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 1c8c8f00217cc..0d8b35d1d6249 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -31,14 +31,13 @@ register_operators(
   ${COLLECTIVE_DEPS})

 if(WITH_NCCL OR WITH_RCCL)
-  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi
-      common)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common phi)
   op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
   op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()

 if(WITH_XPU_BKCL)
-  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} phi)
   op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
   op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 985558c6c7b4c..32a8c69cf6d0d 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -67,11 +67,6 @@ cc_library(
   phi
   common)

-cc_library(
-  collective_helper
-  SRCS collective_helper.cc
-  DEPS framework_proto device_context phi common)
-
 # Manage all device event library
 set(DEVICE_EVENT_LIBS)

From f1d7fbaa88838c90482b44d7a35f42534a1c5b09 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Thu, 10 Oct 2024 13:35:51 +0800
Subject: [PATCH 055/135] [Paddle TensorRT] add pd_op.bilinear,pd_op.nearest
 converter (#68529)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* split converter

* forgot to commit a unit test

* record code progress

* add concat converter

* delete unittest

* rerun ci

* fix bugs

* bilinear+nearest

* add comments

* address review comments

* forgot to add the unit test timeout

* modify the unit test base class

---------

Co-authored-by: YuanRisheng
---
 .../transforms/tensorrt/trt_op_marker_pass.cc | 183 ++++++++++++
 python/paddle/tensorrt/converter.py           |   1 +
 python/paddle/tensorrt/impls/common.py        | 248 ++++++++++++++++
 test/tensorrt/CMakeLists.txt                  |   1 +
 test/tensorrt/tensorrt_test_base.py           |  92 ++++--
 test/tensorrt/test_converter_common.py        | 264 ++++++++++++++++++
 6 files changed, 759 insertions(+), 30 deletions(-)
 create mode 100644 test/tensorrt/test_converter_common.py
diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
index 04cd8021187c6..e8890d6156deb 100644
--- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
+++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -1279,6 +1279,187 @@ class MeanOpPattern : public pir::OpRewritePattern {
   }
 };
+class BilinearInterpV2Pattern
+    : public pir::OpRewritePattern<paddle::dialect::BilinearInterpOp> {
+ public:
+  using pir::OpRewritePattern<
+      paddle::dialect::BilinearInterpOp>::OpRewritePattern;
+  bool MatchAndRewrite(paddle::dialect::BilinearInterpOp op,
+                       pir::PatternRewriter &rewriter) const override {
+    if (op->HasAttribute(kCanRunTrtAttr) &&
+        op->attribute<pir::BoolAttribute>(kCanRunTrtAttr).data()) {
+      return false;
+    }
+    const std::vector<std::string> required_attrs = {"data_format",
+                                                     "interp_method",
+                                                     "align_corners",
+                                                     "scale",
+                                                     "out_h",
+                                                     "out_w"};
+    for (const auto &attr : required_attrs) {
+      if (!op->HasAttribute(attr)) {
+        VLOG(3) << "BilinearInterpV2 " << attr << " attribute does not exist";
+        return false;
+      }
+    }
+    pir::Value size_tensor = op.operand_source(2);
+    if (size_tensor != nullptr) {
+      VLOG(3) << "The Paddle-TRT doesn't support the SizeTensor for "
+                 "BilinearInterpV2";
+      return false;
+    }
+
+    auto data_format =
+        op->attribute<pir::StrAttribute>("data_format").AsString();
+    if (data_format != "NCHW" && data_format != "NHWC") {
+      VLOG(3) << "BilinearInterpV2: data format must be NCHW or NHWC";
+      return false;
+    }
+    auto interp_method =
+        op->attribute<pir::StrAttribute>("interp_method").AsString();
+    if (interp_method != "bilinear") {
+      VLOG(3) << "The interp_method of BilinearInterpV2 is not bilinear";
+      return false;
+    }
+
+    pir::Value scale_tensor = op.operand_source(3);
+
+    bool has_scale_input = false;
+    if (scale_tensor) {
+      has_scale_input = true;
+    }
+
+    if (has_scale_input) {
+      VLOG(3) << "BilinearInterpV2 with a scale input cannot be converted to "
+                 "trt; only the scale attribute is supported";
+      return false;
+    }
+    if (!has_scale_input && op->HasAttribute("scale")) {
+      std::vector<float> scale;
+      auto scale_attr = op->attribute<pir::ArrayAttribute>("scale");
+      for (const auto &attr : scale_attr.AsVector()) {
+        scale.push_back(attr.dyn_cast<pir::FloatAttribute>().data());
+      }
+      if (scale.size() <= 1) {
+        if (!op->HasAttribute("out_h") || !op->HasAttribute("out_w")) {
+          VLOG(3) << "BilinearInterpV2 doesn't have scale_tensor and the "
+                     "scale size <= 1 and without "
+                     "out_h / out_w, it will return false";
+          return false;
+        }
+        auto out_h = op->attribute<pir::Int32Attribute>("out_h").data();
+        auto out_w = op->attribute<pir::Int32Attribute>("out_w").data();
+        if (!(out_h <= 0 && out_w <= 0)) {
+          if (out_h <= 0) {
+            VLOG(3) << "BilinearInterpV2 out_h must be greater than 0 if "
                       "scale is not set.";
+            return false;
+          }
+          if (out_w <= 0) {
+            VLOG(3) << "BilinearInterpV2 out_w must be greater than 0 if "
+                       "scale is not set.";
+            return false;
+          }
+        }
+      } else {
+        for (size_t i = 0; i < scale.size(); i++) {
+          if (scale[i] <= 0) {
+            VLOG(3) << "BilinearInterpV2 dynamic shape does not support "
+                       "Attr(scale["
+                    << i << "] " << scale[i]
+                    << " less than or equal to 0 when Input(Scale) is not "
+                       "set.";
+            return false;
+          }
+        }
+      }
+    }
+
+    op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
+    return true;
+  }
+};
+
+class NearestInterV2Pattern
+    : public pir::OpRewritePattern<paddle::dialect::NearestInterpOp> {
+ public:
+  using pir::OpRewritePattern<
+      paddle::dialect::NearestInterpOp>::OpRewritePattern;
+  bool MatchAndRewrite(paddle::dialect::NearestInterpOp op,
+                       pir::PatternRewriter &rewriter) const override {
+    if (op->HasAttribute(kCanRunTrtAttr) &&
+        op->attribute<pir::BoolAttribute>(kCanRunTrtAttr).data()) {
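+      // The op has already been marked as TRT-convertible by an earlier
+      // application of this pattern, so there is nothing left to do.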
+      return false;
+    }
+    const std::vector<std::string> required_attrs = {"data_format",
+                                                     "interp_method",
+                                                     "align_corners",
+                                                     "scale",
+                                                     "out_h",
+                                                     "out_w"};
+    for (const auto &attr : required_attrs) {
+      if (!op->HasAttribute(attr)) {
+        VLOG(3) << "NearestInterV2 " << attr << " attribute does not exist";
+        return false;
+      }
+    }
+
+    pir::Value size_tensor = op.operand_source(2);
+
+    auto data_format =
+        op->attribute<pir::StrAttribute>("data_format").AsString();
+    if (data_format != "NCHW" && data_format != "NHWC") {
+      VLOG(3) << "NearestInterV2: data format must be NCHW or NHWC";
+      return false;
+    }
+    auto interp_method =
+        op->attribute<pir::StrAttribute>("interp_method").AsString();
+    if (interp_method != "nearest") {
+      VLOG(3) << "The interp_method of NearestInterV2 is not nearest";
+      return false;
+    }
+    bool has_size_input = false;
+    if (size_tensor) {
+      has_size_input = true;
+    }
+
+#if IS_TRT_VERSION_GE(8200)
+    if (has_size_input) {
+      auto size_tensor_type = size_tensor.type();
+      if (size_tensor_type.isa<pir::VectorType>()) {
+        auto vector_type = size_tensor.type().dyn_cast<pir::VectorType>();
+        if (vector_type.size() == 2) {
+          op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
+          return true;
+        }
+      }
+    }
+#endif
+
+    if (op->HasAttribute("scale")) {
+      std::vector<float> scale;
+      auto scale_attr = op->attribute<pir::ArrayAttribute>("scale");
+      for (const auto &attr : scale_attr.AsVector()) {
+        scale.push_back(attr.dyn_cast<pir::FloatAttribute>().data());
+      }
+      auto out_h = op->attribute<pir::Int32Attribute>("out_h").data();
+      auto out_w = op->attribute<pir::Int32Attribute>("out_w").data();
+      if (!(out_h > 0 && out_w > 0)) {
+        if (scale.size() < 2) {
+          VLOG(3) << "NearestInterV2 scale attribute size < 2";
+          return false;
+        }
+        if (scale[0] <= 0.f || scale[1] <= 0.f) {
+          VLOG(3) << "scale factor must be greater than 0 if out_h or out_w "
+                     "is not set.";
+          return false;
+        }
+      }
+    }
+
+    op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
+    return true;
+  }
+};
+
 class TrtOpMarkerPass : public pir::PatternRewritePass {
  public:
  TrtOpMarkerPass() : pir::PatternRewritePass("trt_op_marker_pass", 2) {}
@@ -1352,6 +1533,8 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
+    ps.Add(std::make_unique<BilinearInterpV2Pattern>(context));
+    ps.Add(std::make_unique<NearestInterV2Pattern>(context));
     return ps;
   }
 };
diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py
index 4af300760df5e..7446f32184870 100644
--- a/python/paddle/tensorrt/converter.py
+++ b/python/paddle/tensorrt/converter.py
@@ -31,6 +31,7 @@
 from .impls.activation import *  # noqa: F403
 from .impls.attribute import *  # noqa: F403
+from .impls.common import *  # noqa: F403
 from .impls.conv import *  # noqa: F403
 from .impls.creation import *  # noqa: F403
 from .impls.linalg import *  # noqa: F403
diff --git a/python/paddle/tensorrt/impls/common.py b/python/paddle/tensorrt/impls/common.py
new file mode 100644
index 0000000000000..5fd08d0c0ecfa
--- /dev/null
+++ b/python/paddle/tensorrt/impls/common.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorrt as trt + +from paddle.tensorrt.converter_utils import get_shape_tensor_element +from paddle.tensorrt.register import converter_registry + + +@converter_registry.register("pd_op.bilinear_interp", trt_version="8.x") +def bilinear_interp_converter(network, paddle_op, inputs): + input_tensor = inputs[0] + input_shape = paddle_op.operands()[0].source().shape + data_format = paddle_op.attrs().get("data_format") + interp_method = paddle_op.attrs().get("interp_method") + align_corners = paddle_op.attrs().get("align_corners") + align_mode = paddle_op.attrs().get("align_mode") + out_h = paddle_op.attrs().get("out_h") + out_w = paddle_op.attrs().get("out_w") + out_d = paddle_op.attrs().get("out_d") + scale_attr = paddle_op.attrs().get("scale") + + trt_major, trt_minor, trt_patch = trt.__version__.split(".") + trt_version_float = float(f"{trt_major}.{trt_minor}") + + resize_layer = network.add_resize(input_tensor) + # Set resize mode to LINEAR unconditionally + if trt_version_float >= 8.6: + resize_layer.resize_mode = trt.InterpolationMode.LINEAR + else: + resize_layer.resize_mode = trt.ResizeMode.LINEAR + + # Set coordinate transformation based on align_corners and align_mode + if align_corners: + resize_layer.coordinate_transformation = ( + trt.ResizeCoordinateTransformation.ALIGN_CORNERS + ) + else: + if align_mode == 0: + resize_layer.coordinate_transformation = ( + trt.ResizeCoordinateTransformation.HALF_PIXEL + ) + else: # align_mode == 1 + resize_layer.coordinate_transformation = ( + trt.ResizeCoordinateTransformation.ASYMMETRIC + ) + + if data_format == "NCHW": + h_axis = 2 + w_axis = 3 + elif data_format == "NHWC": + h_axis = 1 + w_axis = 2 + + in_dim = input_tensor.shape + + outsize_tensor = None + if trt_version_float >= 8.2: + if len(inputs) > 1 and inputs[1] is not None: + output_tensor_operand = paddle_op.operands()[1].source() + outsize_tensor = inputs[1] + + use_scales = True + if outsize_tensor is not None: + use_scales = False + elif out_h > 0 and out_w > 0 and scale_attr is not None: + use_scales = True + + if use_scales: + scale_h = -1.0 + scale_w = -1.0 + + if scale_attr and len(scale_attr) > 1: + scale_h = scale_attr[0] + scale_w = scale_attr[1] + elif scale_attr and len(scale_attr) == 1: + scale_h = scale_w = scale_attr[0] + + if scale_w > 0 and scale_h > 0: + if in_dim[h_axis] > 0 and in_dim[w_axis] > 0: + out_h = int(in_dim[h_axis] * scale_h) + out_w = int(in_dim[w_axis] * scale_w) + else: + if out_h > 0 and out_w > 0 and not (scale_w > 0 and scale_h > 0): + if in_dim[h_axis] > 0 and in_dim[w_axis] > 0: + scale_h = float(out_h) / float(in_dim[h_axis]) + scale_w = float(out_w) / float(in_dim[w_axis]) + + scales = [1.0] * len(input_tensor.shape) + if data_format == "NCHW": + scales[2] = scale_h + scales[3] = scale_w + elif data_format == "NHWC": + scales[1] = scale_h + scales[2] = scale_w + + resize_layer.scales = scales + else: + if outsize_tensor is not None: + outsize_itensors = [] + input_shape_tensor = network.add_shape(input_tensor).get_output(0) + batch_dim = get_shape_tensor_element(network, input_shape_tensor, 0) + outsize_itensors.append(batch_dim) + if data_format == "NCHW": + channel_dim = get_shape_tensor_element( + network, input_shape_tensor, 1 + ) + outsize_itensors.append(channel_dim) + outsize_itensors.append(outsize_tensor) + elif data_format == "NHWC": + channel_dim = get_shape_tensor_element( + network, input_shape_tensor, 3 + ) + 
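+                # NHWC: the [H, W] size tensor goes in before the channel
+                # dim, so the concatenated shape is [batch, H, W, channel].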
outsize_itensors.append(outsize_tensor) + outsize_itensors.append(channel_dim) + output_size_tensor = network.add_concatenation( + outsize_itensors + ).get_output(0) + resize_layer.set_input(1, output_size_tensor) + + return resize_layer.get_output(0) + + +@converter_registry.register("pd_op.nearest_interp", trt_version="8.x") +def nearest_interp_converter(network, paddle_op, inputs): + input_tensor = inputs[0] + input_shape = paddle_op.operands()[0].source().shape + data_format = paddle_op.attrs().get("data_format") + interp_method = paddle_op.attrs().get("interp_method") + align_corners = paddle_op.attrs().get("align_corners") + out_h = paddle_op.attrs().get("out_h") + out_w = paddle_op.attrs().get("out_w") + out_d = paddle_op.attrs().get("out_d") + scale_attr = paddle_op.attrs().get("scale") + + # Parse TensorRT version + trt_major, trt_minor, trt_patch = trt.__version__.split(".") + trt_version_float = float(f"{trt_major}.{trt_minor}") + + # Create Resize layer + resize_layer = network.add_resize(input_tensor) + + if trt_version_float >= 8.6: + if align_corners: + resize_layer.coordinate_transformation = ( + trt.ResizeCoordinateTransformation.ASYMMETRIC + ) + else: + resize_layer.coordinate_transformation = ( + trt.ResizeCoordinateTransformation.ASYMMETRIC + ) + + in_dim = input_tensor.shape + scale_h = 1.0 + scale_w = 1.0 + + if scale_attr is not None and len(scale_attr) >= 2: + scale_h = scale_attr[0] + scale_w = scale_attr[1] + else: + if out_h > 0 and out_w > 0: + if data_format == "NCHW": + h_axis = 2 + w_axis = 3 + elif data_format == "NHWC": + h_axis = 1 + w_axis = 2 + + scale_h = float(out_h) / float(in_dim[h_axis]) + scale_w = float(out_w) / float(in_dim[w_axis]) + + outsize_tensor = None + if trt_version_float >= 8.2: + if len(inputs) > 2 and inputs[2] is not None: + size_tensor_operand = paddle_op.operands()[2].source() + if size_tensor_operand.is_combine(): + size_tensors = inputs[2] + if not isinstance(size_tensors, list): + size_tensors = [size_tensors] + if len(size_tensors) >= 2: + # Extract the first two elements representing height and width + outsize_h = size_tensors[0] + outsize_w = size_tensors[1] + outsize_tensor = network.add_concatenation( + [outsize_h, outsize_w] + ).get_output(0) + else: + size_tensor_shape = size_tensor_operand.source().shape + if size_tensor_shape.size >= 2: + size_tensor = inputs[2] + outsize_h = network.add_slice( + size_tensor, start=[0], shape=[1], stride=[1] + ).get_output(0) + outsize_w = network.add_slice( + size_tensor, start=[1], shape=[1], stride=[1] + ).get_output(0) + outsize_tensor = network.add_concatenation( + [outsize_h, outsize_w] + ).get_output(0) + + scales = [1.0] * len(input_tensor.shape) + if data_format == "NCHW": + scales[1] = 1.0 + scales[2] = scale_h + scales[3] = scale_w + elif data_format == "NHWC": + scales[1] = scale_h + scales[2] = scale_w + scales[3] = 1.0 + else: + raise ValueError( + f"Unsupported data format {data_format}, only NCHW or NHWC are supported." 
+ ) + if outsize_tensor is not None: + outsize_itensors = [] + input_shape_tensor = network.add_shape(input_tensor).get_output(0) + batch_dim = get_shape_tensor_element(network, input_shape_tensor, 0) + outsize_itensors.append(batch_dim) + if data_format == "NCHW": + channel_dim = get_shape_tensor_element( + network, input_shape_tensor, 1 + ) + outsize_itensors.append(channel_dim) + outsize_itensors.append(outsize_tensor) + elif data_format == "NHWC": + channel_dim = get_shape_tensor_element( + network, input_shape_tensor, 3 + ) + outsize_itensors.append(outsize_tensor) + outsize_itensors.append(channel_dim) + resize_layer.set_input( + 1, network.add_concatenation(outsize_itensors).get_output(0) + ) + else: + resize_layer.scales = scales + + return resize_layer.get_output(0) diff --git a/test/tensorrt/CMakeLists.txt b/test/tensorrt/CMakeLists.txt index ea789f7e75494..15fa3c0e4af1b 100644 --- a/test/tensorrt/CMakeLists.txt +++ b/test/tensorrt/CMakeLists.txt @@ -22,4 +22,5 @@ if(NOT WIN32 AND TENSORRT_FOUND) set_tests_properties(test_converter_manipulation PROPERTIES TIMEOUT "100") set_tests_properties(test_converter_creation PROPERTIES TIMEOUT "100") set_tests_properties(test_converter_attribute PROPERTIES TIMEOUT "100") + set_tests_properties(test_converter_common PROPERTIES TIMEOUT "300") endif() diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index 1878ef63d9c4e..7db19fe966b75 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -51,17 +51,30 @@ def create_fake_program(self): for sub_arg_name, sub_arg_value in self.api_args[ feed_name ].items(): - input_shape_without_dynamic_dim = sub_arg_value.shape[ - 1: - ] - input_dynamic_shape = [-1] - input_dynamic_shape.extend( - input_shape_without_dynamic_dim - ) + + if ( + feed_name in self.min_shape.keys() + and feed_name in self.max_shape.keys() + ): + input_shape_without_dynamic_dim = ( + sub_arg_value.shape[1:] + ) + input_dynamic_shape = [-1] + input_dynamic_shape.extend( + input_shape_without_dynamic_dim + ) + input_shape = input_dynamic_shape + else: + input_shape = [] + input_shape_without_dynamic_dim = ( + sub_arg_value.shape[0:] + ) + input_shape.extend(input_shape_without_dynamic_dim) + input_dtype = sub_arg_value.dtype input_data = paddle.static.data( name=sub_arg_name, - shape=input_dynamic_shape, + shape=input_shape, dtype=input_dtype, ) new_list_args.append(input_data) @@ -85,6 +98,7 @@ def create_fake_program(self): input_shape = self.api_args[feed_name].shape input_dtype = self.api_args[feed_name].dtype + input_data = paddle.static.data( name=feed_name, shape=input_shape, @@ -156,30 +170,48 @@ def check_trt_result(self, rtol=1e-5, atol=1e-5): min_shape_data = dict() # noqa: C408 max_shape_data = dict() # noqa: C408 for feed_name in self.program_config["feed_list"]: - if ( - feed_name not in self.min_shape.keys() - and feed_name not in self.max_shape.keys() - ): - min_shape_data[feed_name] = self.api_args[feed_name] - max_shape_data[feed_name] = self.api_args[feed_name] - continue - if isinstance(self.api_args[feed_name], dict): - for i in range(len(self.min_shape[feed_name])): - sub_feed_name = feed_name + str(i) - min_shape_data[sub_feed_name] = np.random.randn( - *self.min_shape[feed_name][i] - ).astype(self.api_args[feed_name][sub_feed_name].dtype) - max_shape_data[sub_feed_name] = np.random.randn( - *self.max_shape[feed_name][i] - ).astype(self.api_args[feed_name][sub_feed_name].dtype) + # shape_tensor + if ( + feed_name not in 
self.min_shape.keys() + and feed_name not in self.max_shape.keys() + ): + for sub_feed_name, sub_feed_value in self.api_args[ + feed_name + ].items(): + min_shape_data[sub_feed_name] = sub_feed_value + max_shape_data[sub_feed_name] = sub_feed_value + continue + else: + # not shape_tensor + for i in range(len(self.min_shape[feed_name])): + sub_feed_name = feed_name + str(i) + min_shape_data[sub_feed_name] = np.random.randn( + *self.min_shape[feed_name][i] + ).astype( + self.api_args[feed_name][sub_feed_name].dtype + ) + max_shape_data[sub_feed_name] = np.random.randn( + *self.max_shape[feed_name][i] + ).astype( + self.api_args[feed_name][sub_feed_name].dtype + ) else: - min_shape_data[feed_name] = np.random.randn( - *self.min_shape[feed_name] - ).astype(self.api_args[feed_name].dtype) - max_shape_data[feed_name] = np.random.randn( - *self.max_shape[feed_name] - ).astype(self.api_args[feed_name].dtype) + # shape_tensor is list + if ( + feed_name not in self.min_shape.keys() + and feed_name not in self.max_shape.keys() + ): + min_shape_data[feed_name] = self.api_args[feed_name] + max_shape_data[feed_name] = self.api_args[feed_name] + continue + else: + min_shape_data[feed_name] = np.random.randn( + *self.min_shape[feed_name] + ).astype(self.api_args[feed_name].dtype) + max_shape_data[feed_name] = np.random.randn( + *self.max_shape[feed_name] + ).astype(self.api_args[feed_name].dtype) scope = paddle.static.global_scope() main_program = warmup_shape_infer( diff --git a/test/tensorrt/test_converter_common.py b/test/tensorrt/test_converter_common.py new file mode 100644 index 0000000000000..da886bb623c3d --- /dev/null +++ b/test/tensorrt/test_converter_common.py @@ -0,0 +1,264 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from tensorrt_test_base import TensorRTBaseTest + +import paddle +from paddle import _C_ops + + +def upsample_bilinear(x): + upsample = paddle.nn.Upsample(size=[12, 12], mode="bilinear") + return upsample(x) + + +def bilinear_python_api(x, OutSize, SizeTensor, Scale, attrs): + return _C_ops.bilinear_interp( + x, + OutSize, + SizeTensor, + Scale, + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) + + +def nearest_python_api(x, OutSize, SizeTensor, Scale, attrs): + return _C_ops.nearest_interp( + x, + OutSize, + SizeTensor, + Scale, + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) + + +class TestBilinearScaleTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = bilinear_python_api + self.api_args = { + "x": np.random.random([2, 3, 6, 10]).astype("float32"), + "OutSize": None, + "SizeTensor": None, + "Scale": None, + "attrs": { + "data_layout": "NCHW", + "scale": [2.0, 2.0], + "out_h": 12, + "out_w": 12, + "out_d": -1, + "interp_method": "bilinear", + "align_corners": True, + "align_mode": 1, + }, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 3, 6, 10]} + self.max_shape = {"x": [12, 3, 6, 10]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestBilinearNHWCTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = bilinear_python_api + x_nchw = np.random.random([2, 3, 6, 10]).astype("float32") + x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1)) + self.api_args = { + "x": x_nhwc, + "OutSize": None, + "SizeTensor": None, + "Scale": None, + "attrs": { + "data_layout": "NHWC", + "scale": [], + "out_h": 12, + "out_w": 12, + "out_d": -1, + "interp_method": "bilinear", + "align_corners": False, + "align_mode": 0, + }, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 6, 10, 3]} + self.max_shape = {"x": [12, 6, 10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestBilinearOutSizeTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = bilinear_python_api + self.api_args = { + "x": np.random.random([2, 3, 6, 10]).astype("float32"), + "OutSize": np.array([12, 12], dtype="int32"), + "SizeTensor": None, + "Scale": None, + "attrs": { + "data_layout": "NCHW", + "scale": [], + "out_h": 12, + "out_w": 12, + "out_d": -1, + "interp_method": "bilinear", + "align_corners": False, + "align_mode": 0, + }, + } + self.program_config = {"feed_list": ["x", "OutSize"]} + self.min_shape = {"x": [2, 3, 6, 10]} + self.max_shape = {"x": [12, 3, 6, 10]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestNearestNHWCTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = nearest_python_api + x_nchw = np.random.random([2, 3, 6, 10]).astype("float32") + x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1)) + self.api_args = { + "x": x_nhwc, + "OutSize": None, + "SizeTensor": None, + "Scale": None, + "attrs": { + "data_layout": "NHWC", + "scale": [], + "out_h": 12, + "out_w": 12, + "out_d": -1, + "interp_method": "nearest", + "align_corners": False, + "align_mode": 1, + }, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 6, 10, 3]} + self.max_shape = {"x": [12, 6, 10, 3]} + + def 
test_trt_result(self): + self.check_trt_result() + + +class TestNearestSizeTensorTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = nearest_python_api + x_nchw = np.random.random([2, 3, 6, 10]).astype("float32") + self.api_args = { + "x": x_nchw, + "OutSize": None, + "SizeTensor": [ + np.array([12], dtype="int32"), + np.array([12], dtype="int32"), + ], + "Scale": None, + "attrs": { + "data_layout": "NCHW", + "scale": [], + "out_h": 12, + "out_w": 12, + "out_d": -1, + "interp_method": "nearest", + "align_corners": False, + "align_mode": 0, + }, + } + self.program_config = {"feed_list": ["x", "SizeTensor"]} + self.min_shape = {"x": [2, 3, 6, 10]} + self.max_shape = {"x": [12, 3, 6, 10]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestNearestOutAndScaleTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = nearest_python_api + x_nchw = np.random.random([2, 3, 6, 10]).astype("float32") + self.api_args = { + "x": x_nchw, + "OutSize": None, + "SizeTensor": None, + "Scale": None, + "attrs": { + "data_layout": "NCHW", + "scale": [2, 2], + "out_h": 12, + "out_w": 12, + "out_d": -1, + "interp_method": "nearest", + "align_corners": True, + "align_mode": 1, + }, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 3, 6, 10]} + self.max_shape = {"x": [12, 3, 6, 10]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestBilinearTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = upsample_bilinear + self.api_args = {"x": np.random.random([2, 3, 6, 10]).astype("float32")} + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 3, 6, 10]} + self.max_shape = {"x": [12, 3, 6, 10]} + + def test_trt_result(self): + self.check_trt_result() + + +def upsample_nearest(x): + upsample = paddle.nn.Upsample(size=[12, 12], mode="nearest") + return upsample(x) + + +class TestNearestInterpTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = upsample_nearest + self.api_args = {"x": np.random.random([2, 3, 6, 10]).astype("float32")} + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 3, 6, 10]} + self.max_shape = {"x": [12, 3, 6, 10]} + + def test_trt_result(self): + self.check_trt_result() + + +if __name__ == "__main__": + unittest.main() From 75dfcf2e27d6e1463e9886f287d68c013e30efbd Mon Sep 17 00:00:00 2001 From: aooxin <59520374+aooxin@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:04:28 +0800 Subject: [PATCH 056/135] [Inference] refine some code (#68600) Co-authored-by: yuanlehome --- paddle/fluid/pir/dialect/CMakeLists.txt | 1 + .../interface/layout_transformation.cc | 4 +- .../general/transfer_layout_pass.cc | 4 +- paddle/fluid/pir/utils/general_functions.cc | 27 ++++++++++++- paddle/fluid/pir/utils/general_functions.h | 11 ++++++ paddle/pir/include/pass/utils.h | 24 ------------ paddle/pir/src/pass/utils.cc | 39 ------------------- 7 files changed, 41 insertions(+), 69 deletions(-) delete mode 100644 paddle/pir/include/pass/utils.h delete mode 100644 paddle/pir/src/pass/utils.cc diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index a96b2ed12212c..49888391b0a84 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -309,6 +309,7 @@ set(op_dialect_deps type_info string_helper global_utils + pir_general_functions amp) if(WITH_ROCM) set(op_dialect_deps ${op_dialect_deps} global_utils) diff --git 
a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc index 135882330a4fc..aaa394128bf66 100644 --- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc @@ -17,10 +17,10 @@ #include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/utils/general_functions.h" #include "paddle/phi/common/scalar.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/ir_context.h" -#include "paddle/pir/include/pass/utils.h" namespace paddle::dialect { @@ -72,7 +72,7 @@ void RewriteByLayoutImpl(pir::Operation* op, } for (auto value : RelevantOutputsImpl(op)) { - SetNewLayoutForValue(value, new_layout); + pir::SetNewLayoutForValue(value, new_layout); } } diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc index f0536e808db37..05c32b6ebfe04 100644 --- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc +++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc @@ -35,13 +35,13 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/utils/general_functions.h" #include "paddle/pir/include/core/builtin_dialect.h" #include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/program.h" #include "paddle/pir/include/pass/pass.h" #include "paddle/pir/include/pass/pass_manager.h" #include "paddle/pir/include/pass/pass_registry.h" -#include "paddle/pir/include/pass/utils.h" struct Node; @@ -655,7 +655,6 @@ class TransferLayoutPass : public pir::Pass { for (auto op : ops) { VLOG(10) << op << ","; } - VLOG(10); } VLOG(10) << "-----------------------[op src set]------------------------"; @@ -795,7 +794,6 @@ class TransferLayoutPass : public pir::Pass { for (const auto& op : operation_set) { VLOG(10) << " op: " << op << ","; } - VLOG(10); const auto& perm = ((src_set.count(node) > 0) ? 
layout_to_perm("NCHW", "NHWC") : layout_to_perm("NHWC", "NCHW")); diff --git a/paddle/fluid/pir/utils/general_functions.cc b/paddle/fluid/pir/utils/general_functions.cc index 8bd2e5cb06f17..252cde4b17875 100644 --- a/paddle/fluid/pir/utils/general_functions.cc +++ b/paddle/fluid/pir/utils/general_functions.cc @@ -23,11 +23,13 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" + #include "paddle/pir/include/core/builtin_op.h" +#include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/op_operand.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/core/program.h" -#include "paddle/pir/include/core/value.h" namespace { @@ -164,4 +166,27 @@ bool ValueIsPersistable(const pir::Value& value) { return true; } +void SetNewLayoutForValue(pir::Value value, + const common::DataLayout& new_layout) { + if (!value || !value.type()) { + return; + } + auto tensor_type = value.type().dyn_cast(); + if (!tensor_type) { + return; + } + auto old_layeut = tensor_type.data_layout(); + if (old_layeut == new_layout) { + return; + } + + auto new_tensor_type = pir::DenseTensorType::get(pir::IrContext::Instance(), + tensor_type.dtype(), + tensor_type.dims(), + new_layout, + tensor_type.lod(), + tensor_type.offset()); + value.set_type(new_tensor_type); +} + } // namespace pir diff --git a/paddle/fluid/pir/utils/general_functions.h b/paddle/fluid/pir/utils/general_functions.h index e2c655804def5..96b87bf6ba396 100644 --- a/paddle/fluid/pir/utils/general_functions.h +++ b/paddle/fluid/pir/utils/general_functions.h @@ -17,7 +17,9 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/pir/include/core/type.h" +#include "paddle/pir/include/core/value.h" namespace pir { @@ -107,4 +109,13 @@ std::vector GetUsedExternalValue(const Block& block); */ bool ValueIsPersistable(const pir::Value& value); +/** + * @brief Set the layout of an value. + * + * @param pir::Value the value to be process + * @param const DataLayout& new layout + * + */ +void SetNewLayoutForValue(pir::Value value, const common::DataLayout& layout); + } // namespace pir diff --git a/paddle/pir/include/pass/utils.h b/paddle/pir/include/pass/utils.h deleted file mode 100644 index 9a2cbc0274793..0000000000000 --- a/paddle/pir/include/pass/utils.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/common/layout.h" -#include "paddle/pir/include/core/value.h" - -namespace pir { - -void SetNewLayoutForValue(pir::Value value, common::DataLayout new_layout); - -} // namespace pir diff --git a/paddle/pir/src/pass/utils.cc b/paddle/pir/src/pass/utils.cc deleted file mode 100644 index f866d7beaf8a2..0000000000000 --- a/paddle/pir/src/pass/utils.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pir/include/pass/utils.h" - -#include "paddle/pir/include/core/builtin_type.h" -#include "paddle/pir/include/core/ir_context.h" - -namespace pir { - -void SetNewLayoutForValue(pir::Value value, common::DataLayout new_layout) { - if (!value || !value.type()) { - return; - } - auto tensor_type = value.type().dyn_cast(); - if (!tensor_type) { - return; - } - auto new_tensor_type = pir::DenseTensorType::get(pir::IrContext::Instance(), - tensor_type.dtype(), - tensor_type.dims(), - new_layout, - tensor_type.lod(), - tensor_type.offset()); - value.set_type(new_tensor_type); -} - -} // namespace pir From bddc5a833afd76ad590bf2951408f0ed37ea234f Mon Sep 17 00:00:00 2001 From: chen2016013 <111894720+chen2016013@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:21:28 +0800 Subject: [PATCH 057/135] =?UTF-8?q?=E3=80=90CINN=E3=80=91=20Decoupling=20r?= =?UTF-8?q?elationship=20between=20'FLAGS=5Fenable=5Fdist=5Fprim=5Fall'=20?= =?UTF-8?q?and=20'FLAGS=5Fenable=5Fauto=5Frecompute'=20(#68579)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix recompute bug * update --- python/paddle/base/executor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index fec12eb1eb97a..b2513c8148c00 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1220,12 +1220,11 @@ def _get_pir_program_and_executor(self, cached_data): if core._enable_dist_prim_all(): with decomp.prim_guard(): - pir_grad_var_to_var = decomp.decompose_dist_program(program) - if core._enable_auto_recompute(): - print("apply auto_recompute in executor", flush=True) - program = decomp.auto_recompute_pir_program( - program, pir_grad_var_to_var - ) + decomp.decompose_dist_program(program) + + if core._enable_auto_recompute(): + logging.info("apply auto_recompute in executor") + program = decomp.auto_recompute_pir_program(program, None) if in_cinn_mode(): apply_cinn_pass(program) From 68ca08807c773be55c92f0d54349b85645760913 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:27:52 +0800 Subject: [PATCH 058/135] [XPU] fix gather_nd on zero-dim index (#68585) * [XPU] fix gather_nd on zero-dim index * [XPU] fix gather_nd on zero-dim index --- paddle/phi/kernels/cpu/gather_nd_kernel.cc | 2 +- paddle/phi/kernels/gpu/gather_nd_kernel.cu | 2 +- paddle/phi/kernels/xpu/gather_nd_kernel.cc | 5 
++--- test/xpu/test_gather_nd_op_xpu.py | 24 ++++++++++++++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/cpu/gather_nd_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_kernel.cc index 5c21850d7bc94..7ae7e10189227 100644 --- a/paddle/phi/kernels/cpu/gather_nd_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_nd_kernel.cc @@ -26,7 +26,7 @@ void GatherNdKernel(const Context &ctx, const DenseTensor &index, DenseTensor *out) { ctx.template Alloc(out); - if (x.numel() == 0) return; + if (x.numel() == 0 || out->numel() == 0) return; if (index.dims()[0] == 0 && index.numel() == 0) return; auto index_type = index.dtype(); bool index_type_match = diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu index 51bb2bd772b82..77908b67fa23c 100644 --- a/paddle/phi/kernels/gpu/gather_nd_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu @@ -27,7 +27,7 @@ void GatherNdKernel(const Context &ctx, const DenseTensor &index, DenseTensor *out) { ctx.template Alloc(out); - if (x.numel() == 0) return; + if (x.numel() == 0 || out->numel() == 0) return; if (index.dims()[0] == 0 && index.numel() == 0) return; const auto &index_type = index.dtype(); bool index_type_match = diff --git a/paddle/phi/kernels/xpu/gather_nd_kernel.cc b/paddle/phi/kernels/xpu/gather_nd_kernel.cc index 05c4add37d8ff..1c897490d5ae0 100644 --- a/paddle/phi/kernels/xpu/gather_nd_kernel.cc +++ b/paddle/phi/kernels/xpu/gather_nd_kernel.cc @@ -27,9 +27,8 @@ void GatherNdKernel(const Context &ctx, using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(out); - if (x.numel() == 0) { - return; - } + if (x.numel() == 0 || out->numel() == 0) return; + if (index.dims()[0] == 0 && index.numel() == 0) return; if (index.numel() == 0) { auto index_dims = index.dims(); diff --git a/test/xpu/test_gather_nd_op_xpu.py b/test/xpu/test_gather_nd_op_xpu.py index fb1c9c476dd19..8be473e52eb62 100644 --- a/test/xpu/test_gather_nd_op_xpu.py +++ b/test/xpu/test_gather_nd_op_xpu.py @@ -178,5 +178,29 @@ def init_data(self): for stype in support_types: create_test_class(globals(), XPUTestGatherNd, stype) + +class TestZeroDimIndex(unittest.TestCase): + def setUp(self): + paddle.disable_static() + # shape of x: [2, 3, 2] + self.x = paddle.to_tensor( + [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]] + ) + + def test_1(self): + index = np.zeros((0, 1)).astype("int") + index = paddle.to_tensor(index) + output = paddle.gather_nd(self.x, index) + self.assertEqual(output.numel().numpy(), 0) + self.assertEqual(output.shape, [0, 3, 2]) + + def test_2(self): + index = np.zeros((2, 0, 1)).astype("int") + index = paddle.to_tensor(index) + output = paddle.gather_nd(self.x, index) + self.assertEqual(output.numel().numpy(), 0) + self.assertEqual(output.shape, [2, 0, 3, 2]) + + if __name__ == "__main__": unittest.main() From ad7110dc135de9f5e1d9fc6b8f888d1c94471147 Mon Sep 17 00:00:00 2001 From: waliwali777 Date: Thu, 10 Oct 2024 14:29:38 +0800 Subject: [PATCH 059/135] =?UTF-8?q?=E3=80=90Comm=E3=80=91switch=20alltoall?= =?UTF-8?q?=20in=20fluid=20to=20all=5Fto=5Fall=20in=20phi=20(#66883)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * switch send_V2 & recv_V2 in fluid to p_send & p_recv in phi * replace op name in program operator --- .../framework/new_executor/pir_interpreter.cc | 2 +- paddle/phi/ops/yaml/legacy/static_ops.yaml | 10 ---------- paddle/phi/ops/yaml/ops.yaml | 10 ++++++++++ .../communication/stream/all_to_all.py | 11 
++++++----- test/collective/test_collective_alltoall_api.py | 16 +++++++++++----- test/legacy_test/test_collective_api_base.py | 2 +- test/xpu/test_collective_api_base.py | 2 +- 7 files changed, 30 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index f888c4c502981..fed1f5684f299 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -542,7 +542,7 @@ void PirInterpreter::UpdateNcclOpNum() { "pd_op.send_v2", "pd_op.mp_allreduce_sum", "pd_op.barrier", - "pd_op.alltoall", + "pd_op.all_to_all", "pd_op.global_gather", "pd_op.distributed_fused_lamb", "pd_op.margin_cross_entropy", diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index 498a07004c564..8705103f1f48c 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -17,16 +17,6 @@ func : all_reduce param: [x, reduce_type] -- op : all_to_all - args : (Tensor x, int ring_id = 0) - output : Tensor(out) - infer_meta : - func : AllToAllInferMeta - param: [x] - kernel : - func : all_to_all - param: [x] - - op : amax args : (Tensor x, IntArray axis={0}, bool keepdim=false, bool reduce_all=false, int in_dtype=-1, int out_dtype=-1) output : Tensor(out) diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 3139f76f10078..f4e9b876ce538 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -192,6 +192,16 @@ func : all_gather param: [x, nranks] +- op : all_to_all + args : (Tensor x, int ring_id = 0) + output : Tensor(out) + infer_meta : + func : AllToAllInferMeta + param: [x] + kernel : + func : all_to_all + param: [x] + - op : allclose args : (Tensor x, Tensor y, Scalar(double) rtol=1e-5, Scalar(double) atol=1e-8, bool equal_nan=false) output : Tensor(out) diff --git a/python/paddle/distributed/communication/stream/all_to_all.py b/python/paddle/distributed/communication/stream/all_to_all.py index 9e213144fdaf6..7d1750e487f16 100644 --- a/python/paddle/distributed/communication/stream/all_to_all.py +++ b/python/paddle/distributed/communication/stream/all_to_all.py @@ -87,7 +87,7 @@ def _all_to_all_in_static_mode( sync_op: bool, use_calc_stream: bool, ) -> None: - op_type = 'alltoall' + op_type = 'all_to_all' ring_id = 0 if group is None else group.id nranks = dist.get_world_size() helper = framework.LayerHelper(op_type, **locals()) @@ -118,15 +118,16 @@ def _all_to_all_in_static_mode( ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], 'all_to_all', ) - helper.append_op( + op = helper.append_op( type=op_type, - inputs={'X': [in_tensor]}, - outputs={'Out': [out_tensor]}, + inputs={'x': [in_tensor]}, + outputs={'out': [out_tensor]}, attrs={ 'ring_id': ring_id, - 'use_calc_stream': sync_op, }, ) + if sync_op: + op.dist_attr.execution_stream = "default" # NOTE(liyurui): If the argument `out_tensor_or_tensor_list` is a tensor_list, # we need to split the result. So we should wait the result of all_to_all # before split if the communication is not on calc stream. 
diff --git a/test/collective/test_collective_alltoall_api.py b/test/collective/test_collective_alltoall_api.py index 5d4703cae7808..5c3bb4c056006 100644 --- a/test/collective/test_collective_alltoall_api.py +++ b/test/collective/test_collective_alltoall_api.py @@ -25,22 +25,24 @@ class TestCollectiveAllToAllAPI(TestDistBase): def _setup_config(self): pass - def test_alltoall_nccl_with_comm_context(self): + def test_alltoall_nccl_with_new_comm(self): dtypes_to_test = [ + "float16", "float32", + "float64", + "int32", + "int64", ] - if self._nccl_version >= 21000: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place( "collective_alltoall_api.py", "alltoall", "nccl", dtype=dtype, - need_envs={"USE_COMM_CONTEXT": "1"}, + need_envs={"FLAGS_dynamic_static_unified_comm": "true"}, ) - def test_alltoall_nccl_with_new_comm(self): + def test_alltoall_nccl_with_new_comm_pir(self): dtypes_to_test = [ "float16", "float32", @@ -54,6 +56,10 @@ def test_alltoall_nccl_with_new_comm(self): "alltoall", "nccl", dtype=dtype, + need_envs={ + "FLAGS_dynamic_static_unified_comm": "true", + "FLAGS_enable_pir_in_executor": "1", + }, ) def test_alltoall_nccl_dygraph(self): diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index 6b0f27987ac96..e86a19f8ab8ab 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -508,7 +508,7 @@ def convertbf16(origin): np.testing.assert_allclose( result_data, need_result, rtol=1e-05, atol=1e-05 ) - elif col_type == "alltoall": + elif col_type == "all_to_all": need_result1 = np.vstack( ( input1[0 : input1.shape[0] // 2, :], diff --git a/test/xpu/test_collective_api_base.py b/test/xpu/test_collective_api_base.py index 1fda68613b233..b3a77101949a3 100644 --- a/test/xpu/test_collective_api_base.py +++ b/test/xpu/test_collective_api_base.py @@ -501,7 +501,7 @@ def convertbf16(origin): np.testing.assert_allclose( result_data, need_result, rtol=1e-05, atol=1e-05 ) - elif col_type == "alltoall": + elif col_type == "all_to_all": need_result1 = np.vstack( ( input1[0 : input1.shape[0] // 2, :], From 998d5a4e2e22cfd45b5dfc0ac8a8952f355f9b72 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 10 Oct 2024 14:57:22 +0800 Subject: [PATCH 060/135] Fix (#68581) --- paddle/fluid/operators/controlflow/feed_op.cc | 37 ++----------------- .../fluid/operators/ops_signature/feed_sig.cc | 9 +---- 2 files changed, 4 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 73a5962847bd3..cd5a9285a5036 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -68,28 +68,12 @@ void FeedDenseTensorKernel(const Context& dev_ctx, if (phi::is_same_place(in_tensor.place(), place)) { out->ShareDataWith(in_tensor); } else { - framework::TensorCopy(in_tensor, place, dev_ctx, out); + phi::Copy(dev_ctx, in_tensor, place, false, out); } out->set_lod(in_tensor.lod()); } -template -void FeedStringsKernel(const Context& dev_ctx UNUSED, - const phi::ExtendedTensor& x, - int col, - phi::ExtendedTensor* out) { - PADDLE_ENFORCE_NOT_NULL( - out, - common::errors::NotFound( - "Output cannot be found in scope for operator 'Feed'")); - const auto& feed_item = CheckAndGetFeedItem(x, col); - auto strs_out = static_cast(out); - const auto& in_str = paddle::get(feed_item); - strs_out->resize(in_str.size()); - *strs_out = in_str; -} - class FeedOp : public 
framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; @@ -119,24 +103,9 @@ class FeedOp : public framework::OperatorWithKernel { meta.strides = meta.calc_strides(meta.dims); } out_tensor->set_meta(meta); - } else if (feed_item.index() == 1) { // Strings - auto& feed_str = PADDLE_GET_CONST(framework::Strings, feed_item); - out_var->GetMutable()->resize(feed_str.size()); - } else if (feed_item.index() == 2) { // SparseCooTensor - auto& feed_sparse_tensor = - PADDLE_GET_CONST(phi::SparseCooTensor, feed_item); - out_var->GetMutable()->set_meta( - feed_sparse_tensor.meta()); - out_var->GetMutable()->SetCoalesced( - feed_sparse_tensor.coalesced()); - out_var->GetMutable()->SetIndicesDict( - feed_sparse_tensor.GetIndicesDict()); - out_var->GetMutable()->SetKmaps( - feed_sparse_tensor.GetKmaps()); } else { PADDLE_THROW(common::errors::Unimplemented( - "Only support DenseTensor, Strings, and " - "SparseCooTensor for feed op now.")); + "Only support DenseTensor for feed op now.")); } } } @@ -197,4 +166,4 @@ REGISTER_OPERATOR( paddle::operators::FeedOpInfoMaker); PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE( - feed_dense_tensor, ALL_LAYOUT, paddle::operators::FeedDenseTensorKernel) {} + feed, ALL_LAYOUT, paddle::operators::FeedDenseTensorKernel) {} diff --git a/paddle/fluid/operators/ops_signature/feed_sig.cc b/paddle/fluid/operators/ops_signature/feed_sig.cc index e28715ce70c63..717a2ec7afcb6 100644 --- a/paddle/fluid/operators/ops_signature/feed_sig.cc +++ b/paddle/fluid/operators/ops_signature/feed_sig.cc @@ -17,16 +17,9 @@ namespace phi { KernelSignature FeedOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorOutput("Out")) { - return KernelSignature("feed_dense_tensor", {"X"}, {"col"}, {"Out"}); - } else if (ctx.IsSparseCooTensorOutput("Out")) { - return KernelSignature("feed_sparse_coo_tensor", {"X"}, {"col"}, {"Out"}); - } else { - return KernelSignature("feed_strings", {"X"}, {"col"}, {"Out"}); - } + return KernelSignature("feed", {"X"}, {"col"}, {"Out"}); } } // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(feed, feed_dense_tensor); PD_REGISTER_ARG_MAPPING_FN(feed, phi::FeedOpArgumentMapping); From 33616e35f5df503006aec1490bd32f7aab519fb9 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 10 Oct 2024 15:03:31 +0800 Subject: [PATCH 061/135] Modify FetchList to PhiVector type [fluid_ops] (#68588) * Fix * Fix --- paddle/fluid/framework/feed_fetch_type.h | 17 ++++++++++------- paddle/fluid/framework/phi_tensor_base_vector.h | 4 ++++ paddle/fluid/framework/type_info.cc | 1 + 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 12bbf6a95f02a..6be31a062ef07 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -25,22 +25,25 @@ namespace paddle { namespace framework { using FeedType = paddle::variant; +using FetchType = paddle::variant; template <> struct PhiVectorType { const char *type_name = "PhiVectorFeedType"; }; -using FeedList = paddle::framework::PhiVector; +template <> +struct PhiVectorType { + const char *type_name = "PhiVectorFetchType"; +}; -using FetchType = paddle::variant; -using FetchList = std::vector; +using FeedList = paddle::framework::PhiVector; +using FetchList = paddle::framework::PhiVector; using FetchUnmergedList = std::vector>; -using FetchResultType = paddle::variant; inline bool data_is_lod_tensor(const FetchType &data) { if (data.type() == typeid(phi::DenseTensor)) { diff 
--git a/paddle/fluid/framework/phi_tensor_base_vector.h b/paddle/fluid/framework/phi_tensor_base_vector.h index 0784d9791cd98..1d775383de809 100644 --- a/paddle/fluid/framework/phi_tensor_base_vector.h +++ b/paddle/fluid/framework/phi_tensor_base_vector.h @@ -69,6 +69,10 @@ class PhiVector : public phi::ExtendedTensor, void emplace_back(const T& feed_data) { data_.emplace_back(feed_data); } + void emplace_back() { data_.emplace_back(); } + + void push_back(const T& feed_data) { data_.push_back(feed_data); } + void pop_back() { data_.pop_back(); } const T& operator[](size_t index) const { return data_[index]; } diff --git a/paddle/fluid/framework/type_info.cc b/paddle/fluid/framework/type_info.cc index 76e15360fa525..daa91dde9d6db 100644 --- a/paddle/fluid/framework/type_info.cc +++ b/paddle/fluid/framework/type_info.cc @@ -53,4 +53,5 @@ template class TypeInfoTraits; template class TypeInfoTraits; +template class TypeInfoTraits; } // namespace phi From 0e10309b82d91752172233ff19f6d3d1b04c11f2 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 10 Oct 2024 16:11:36 +0800 Subject: [PATCH 062/135] Revert "[Inference] refine some code (#68600)" (#68609) This reverts commit 75dfcf2e27d6e1463e9886f287d68c013e30efbd. --- paddle/fluid/pir/dialect/CMakeLists.txt | 1 - .../interface/layout_transformation.cc | 4 +- .../general/transfer_layout_pass.cc | 4 +- paddle/fluid/pir/utils/general_functions.cc | 27 +------------ paddle/fluid/pir/utils/general_functions.h | 11 ------ paddle/pir/include/pass/utils.h | 24 ++++++++++++ paddle/pir/src/pass/utils.cc | 39 +++++++++++++++++++ 7 files changed, 69 insertions(+), 41 deletions(-) create mode 100644 paddle/pir/include/pass/utils.h create mode 100644 paddle/pir/src/pass/utils.cc diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index 49888391b0a84..a96b2ed12212c 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -309,7 +309,6 @@ set(op_dialect_deps type_info string_helper global_utils - pir_general_functions amp) if(WITH_ROCM) set(op_dialect_deps ${op_dialect_deps} global_utils) diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc index aaa394128bf66..135882330a4fc 100644 --- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc @@ -17,10 +17,10 @@ #include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" -#include "paddle/fluid/pir/utils/general_functions.h" #include "paddle/phi/common/scalar.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/ir_context.h" +#include "paddle/pir/include/pass/utils.h" namespace paddle::dialect { @@ -72,7 +72,7 @@ void RewriteByLayoutImpl(pir::Operation* op, } for (auto value : RelevantOutputsImpl(op)) { - pir::SetNewLayoutForValue(value, new_layout); + SetNewLayoutForValue(value, new_layout); } } diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc index 05c32b6ebfe04..f0536e808db37 100644 --- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc +++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc @@ -35,13 +35,13 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include 
"paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" -#include "paddle/fluid/pir/utils/general_functions.h" #include "paddle/pir/include/core/builtin_dialect.h" #include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/program.h" #include "paddle/pir/include/pass/pass.h" #include "paddle/pir/include/pass/pass_manager.h" #include "paddle/pir/include/pass/pass_registry.h" +#include "paddle/pir/include/pass/utils.h" struct Node; @@ -655,6 +655,7 @@ class TransferLayoutPass : public pir::Pass { for (auto op : ops) { VLOG(10) << op << ","; } + VLOG(10); } VLOG(10) << "-----------------------[op src set]------------------------"; @@ -794,6 +795,7 @@ class TransferLayoutPass : public pir::Pass { for (const auto& op : operation_set) { VLOG(10) << " op: " << op << ","; } + VLOG(10); const auto& perm = ((src_set.count(node) > 0) ? layout_to_perm("NCHW", "NHWC") : layout_to_perm("NHWC", "NCHW")); diff --git a/paddle/fluid/pir/utils/general_functions.cc b/paddle/fluid/pir/utils/general_functions.cc index 252cde4b17875..8bd2e5cb06f17 100644 --- a/paddle/fluid/pir/utils/general_functions.cc +++ b/paddle/fluid/pir/utils/general_functions.cc @@ -23,13 +23,11 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" - #include "paddle/pir/include/core/builtin_op.h" -#include "paddle/pir/include/core/builtin_type.h" -#include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/op_operand.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/core/program.h" +#include "paddle/pir/include/core/value.h" namespace { @@ -166,27 +164,4 @@ bool ValueIsPersistable(const pir::Value& value) { return true; } -void SetNewLayoutForValue(pir::Value value, - const common::DataLayout& new_layout) { - if (!value || !value.type()) { - return; - } - auto tensor_type = value.type().dyn_cast(); - if (!tensor_type) { - return; - } - auto old_layeut = tensor_type.data_layout(); - if (old_layeut == new_layout) { - return; - } - - auto new_tensor_type = pir::DenseTensorType::get(pir::IrContext::Instance(), - tensor_type.dtype(), - tensor_type.dims(), - new_layout, - tensor_type.lod(), - tensor_type.offset()); - value.set_type(new_tensor_type); -} - } // namespace pir diff --git a/paddle/fluid/pir/utils/general_functions.h b/paddle/fluid/pir/utils/general_functions.h index 96b87bf6ba396..e2c655804def5 100644 --- a/paddle/fluid/pir/utils/general_functions.h +++ b/paddle/fluid/pir/utils/general_functions.h @@ -17,9 +17,7 @@ #include #include -#include "paddle/common/layout.h" #include "paddle/pir/include/core/type.h" -#include "paddle/pir/include/core/value.h" namespace pir { @@ -109,13 +107,4 @@ std::vector GetUsedExternalValue(const Block& block); */ bool ValueIsPersistable(const pir::Value& value); -/** - * @brief Set the layout of an value. - * - * @param pir::Value the value to be process - * @param const DataLayout& new layout - * - */ -void SetNewLayoutForValue(pir::Value value, const common::DataLayout& layout); - } // namespace pir diff --git a/paddle/pir/include/pass/utils.h b/paddle/pir/include/pass/utils.h new file mode 100644 index 0000000000000..9a2cbc0274793 --- /dev/null +++ b/paddle/pir/include/pass/utils.h @@ -0,0 +1,24 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/common/layout.h"
+#include "paddle/pir/include/core/value.h"
+
+namespace pir {
+
+void SetNewLayoutForValue(pir::Value value, common::DataLayout new_layout);
+
+}  // namespace pir
diff --git a/paddle/pir/src/pass/utils.cc b/paddle/pir/src/pass/utils.cc
new file mode 100644
index 0000000000000..f866d7beaf8a2
--- /dev/null
+++ b/paddle/pir/src/pass/utils.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pir/include/pass/utils.h"
+
+#include "paddle/pir/include/core/builtin_type.h"
+#include "paddle/pir/include/core/ir_context.h"
+
+namespace pir {
+
+void SetNewLayoutForValue(pir::Value value, common::DataLayout new_layout) {
+  if (!value || !value.type()) {
+    return;
+  }
+  auto tensor_type = value.type().dyn_cast<pir::DenseTensorType>();
+  if (!tensor_type) {
+    return;
+  }
+  auto new_tensor_type = pir::DenseTensorType::get(pir::IrContext::Instance(),
+                                                   tensor_type.dtype(),
+                                                   tensor_type.dims(),
+                                                   new_layout,
+                                                   tensor_type.lod(),
+                                                   tensor_type.offset());
+  value.set_type(new_tensor_type);
+}
+
+}  // namespace pir

From bddc5a833afd76ad590bf2951408f0ed37ea234f Mon Sep 17 00:00:00 2001
From: blacksheep-Aristotle <1146009299@qq.com>
Date: Thu, 10 Oct 2024 16:25:23 +0800
Subject: [PATCH 063/135] fix(pir):fix bug layer_norm kernel error when run in
 pir with amp_o2 use_promote (#68543)

With use_promote, layer_norm bias and scale should be cast to fp32 when the
input is fp32.
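A rough, runnable sketch of the promote rule this patch implements (the
helper below is illustrative only, not a real Paddle API):

    def keeps_fp32_param(op_name, input_name, dst_dtype):
        # Non-"x" inputs of these ops (e.g. layer_norm scale/bias) are left
        # uncast, so promote keeps them in fp32.
        skip_ops = ("batch_norm", "layer_norm", "sync_batch_norm",
                    "weight_only_linear")
        return (dst_dtype in ("float16", "bfloat16")
                and op_name in skip_ops
                and input_name != "x")

    print(keeps_fp32_param("layer_norm", "bias", "bfloat16"))  # True
    print(keeps_fp32_param("layer_norm", "x", "bfloat16"))     # False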
--- paddle/fluid/eager/amp_auto_cast.h | 16 +++++++++++----- paddle/fluid/imperative/amp_utils.h | 21 +++++++++++++-------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index bf25fa0c541f0..cc9d1a70c955b 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -66,11 +66,6 @@ inline paddle::Tensor AmpAutoCast(const std::string& input_name, VLOG(6) << "AMP AmpAutoCasts: op_name(" << op_name << ") input(" << input_name << ") dst_dtype(" << phi::DataTypeToString(dst_dtype) << ")."; - if ((op_name == "batch_norm" || op_name == "layer_norm" || - op_name == "sync_batch_norm") && - input_name != "X") { - return input; - } if (op_name == "fused_softmax_mask" && input_name == "Mask" && input.dtype() == phi::DataType::FLOAT32) { return input; @@ -86,6 +81,17 @@ inline paddle::Tensor AmpAutoCast(const std::string& input_name, return input; } } + if ((op_name == "batch_norm" || op_name == "layer_norm" || + op_name == "sync_batch_norm" || op_name == "weight_only_linear") && + input_name != "x") { + return input; + } + } else if (dst_dtype == phi::DataType::BFLOAT16) { + if ((op_name == "batch_norm" || op_name == "layer_norm" || + op_name == "sync_batch_norm" || op_name == "weight_only_linear") && + input_name != "x") { + return input; + } } if (NeedCast(input, dst_dtype)) { diff --git a/paddle/fluid/imperative/amp_utils.h b/paddle/fluid/imperative/amp_utils.h index 4f6eaf035e31a..88d58f74bb9dd 100644 --- a/paddle/fluid/imperative/amp_utils.h +++ b/paddle/fluid/imperative/amp_utils.h @@ -279,14 +279,8 @@ inline T AmpAutoCast(const std::string& input_name, const phi::DataType& dst_dtype, const std::string& op_name, bool trace_backward = true) { - VLOG(6) << "AMP AmpAutoCasts:" - << " input(" << input_name << ") dst_dtype(" - << phi::DataTypeToString(dst_dtype) << ")."; - if ((op_name == "batch_norm" || op_name == "layer_norm" || - op_name == "sync_batch_norm" || op_name == "weight_only_linear") && - input_name != "x") { - return input; - } + VLOG(6) << "AMP AmpAutoCasts: op_name(" << op_name << ")input(" << input_name + << ") dst_dtype(" << phi::DataTypeToString(dst_dtype) << ")."; if (dst_dtype == phi::DataType::FLOAT16) { if (op_name == "run_program") { @@ -305,6 +299,17 @@ inline T AmpAutoCast(const std::string& input_name, return input; } } + if ((op_name == "batch_norm" || op_name == "layer_norm" || + op_name == "sync_batch_norm" || op_name == "weight_only_linear") && + input_name != "x") { + return input; + } + } else if (dst_dtype == phi::DataType::BFLOAT16) { + if ((op_name == "batch_norm" || op_name == "layer_norm" || + op_name == "sync_batch_norm" || op_name == "weight_only_linear") && + input_name != "x") { + return input; + } } if (NeedCast(input, dst_dtype)) { VLOG(6) << "Input : " << input.impl() << "NeedCast"; From b7624cf4689caff1eb47d7014c4ee1a6ce179d42 Mon Sep 17 00:00:00 2001 From: XiangGao Date: Thu, 10 Oct 2024 16:43:54 +0800 Subject: [PATCH 064/135] fix conv2d_grad error in auto parallel training (#68586) * fix conv2d_grad error in auto parallel training * add unit test --- .../manual/eager_manual/nodes/conv2d_nodes.cc | 7 +++ .../semi_auto_parallel_for_conv2d.py | 55 +++++++++++++++++++ .../test_semi_auto_parallel_basic.py | 10 ++++ 3 files changed, 72 insertions(+) create mode 100644 test/auto_parallel/semi_auto_parallel_for_conv2d.py diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc 
b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
index 43f35b812dfe4..6986bcf883ed9 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
@@ -80,6 +80,13 @@ Conv2dGradNodeFinal::operator()(
       (out_metas[1].empty() || out_metas[1][0].IsStopGradient())
           ? nullptr
           : &returns[1][0];
+
+  // Set DistAttr of Out Tensor for semi-auto parallel
+  if (IsRunAutoParallel()) {
+    egr::EagerUtils::SetGradOutputDistAttr(
+        out_metas, {0, 1}, api_output_0, api_output_1);
+  }
+
   // Runtime check if we need next grad
   bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;
 
diff --git a/test/auto_parallel/semi_auto_parallel_for_conv2d.py b/test/auto_parallel/semi_auto_parallel_for_conv2d.py
new file mode 100644
index 0000000000000..0bcf933010c48
--- /dev/null
+++ b/test/auto_parallel/semi_auto_parallel_for_conv2d.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from semi_auto_parallel_util import SemiAutoParallelTestBase
+
+import paddle
+import paddle.distributed as dist
+
+
+class TestConv2dApiForSemiAutoParallel(SemiAutoParallelTestBase):
+    def __init__(self):
+        super().__init__()
+
+    def check_placements(self, output, expected_placements):
+        assert (
+            output.placements == expected_placements
+        ), f"{output.placements} vs {expected_placements}"
+
+    def test_conv2d_shard(self):
+        shapes = ([8, 3, 8, 8], [6, 3, 3, 3], [6])
+        specs = (['x', None, None], [None], [None])
+        inputs, outputs = self.runfunc_and_check(
+            inputs_shape=shapes,
+            inputs_specs=specs,
+            op_func=paddle.nn.functional.conv2d,
+            # Todo(jeff41404): the spmd rule of conv2d_grad is being fixed;
+            # once it lands, with_backward can be set to True.
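+            # with_backward=False below therefore skips the grad check; the
+            # forward sharding is still verified via check_placements.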
+ with_backward=False, + ) + self.check_placements(outputs, [dist.Shard(0)]) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.test_conv2d_shard() + + +if __name__ == '__main__': + TestConv2dApiForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py index 6b0204fc0fe8c..c5e187fd9a848 100644 --- a/test/auto_parallel/test_semi_auto_parallel_basic.py +++ b/test/auto_parallel/test_semi_auto_parallel_basic.py @@ -58,6 +58,16 @@ def test_concat_api(self): user_defined_envs=envs, ) + def test_conv2d_api(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_for_conv2d.py", + user_defined_envs=envs, + ) + def test_layernorm_api(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs From 57d95d120c37f2dc1da3fc61b1e17253dce18003 Mon Sep 17 00:00:00 2001 From: RuohengMa <120699764+RuohengMa@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:12:49 +0800 Subject: [PATCH 065/135] [XPU] bind Addmm (#68560) * [XPU] bind Addmm * fix --- paddle/phi/backends/xpu/xpu2_op_list.cc | 3 + paddle/phi/backends/xpu/xpu3_op_list.cc | 8 + .../xpu/fused_feedforward_grad_kernel.cc | 9 +- .../fusion/xpu/fused_gemm_epilogue_kernel.cc | 2 +- paddle/phi/kernels/xpu/addmm_grad_kernel.cc | 140 +++++++++ paddle/phi/kernels/xpu/addmm_kernel.cc | 170 ++++++++++ .../xpu/fused_attention_grad_kernel.cc | 16 +- paddle/phi/kernels/xpu/xpu_api_wrapper.h | 10 +- test/xpu/op_test_xpu.py | 2 +- test/xpu/test_addmm_op_xpu.py | 296 ++++++++++++++++++ 10 files changed, 637 insertions(+), 19 deletions(-) create mode 100644 paddle/phi/kernels/xpu/addmm_grad_kernel.cc create mode 100644 paddle/phi/kernels/xpu/addmm_kernel.cc create mode 100644 test/xpu/test_addmm_op_xpu.py diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 8cc61969e1b3c..cb6c28ad28953 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -46,6 +46,9 @@ XPUOpMap& get_kl2_ops() { {"adagrad", XPUKernelSet({phi::DataType::FLOAT32})}, {"addcmul_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"addmm", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"addmm_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"arange_tensor", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index 5c442fde21c51..eab77d1a70958 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -40,6 +40,14 @@ XPUOpMap& get_kl3_ops() { {"adagrad", XPUKernelSet({phi::DataType::FLOAT32})}, {"addcmul_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"addmm", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, + {"addmm_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"arange_tensor", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32, diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc 
b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc index 8bfc27527cfc1..8f84a73822ab5 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc @@ -246,10 +246,10 @@ void FFNGrad(const phi::XPUContext& dev_ctx, } phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true); + xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, 0.f, true); phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true); + xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, 0.f, true); // dropout_grad1 DropoutGrad(xpu_ctx, @@ -335,10 +335,11 @@ void FFNGrad(const phi::XPUContext& dev_ctx, std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info; - phi::MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true); + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, 0.f, true); phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true); + xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, 0.f, true); if (pre_layer_norm) { r = xpu::layer_norm_grad(xpu_ctx, diff --git a/paddle/phi/kernels/fusion/xpu/fused_gemm_epilogue_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_gemm_epilogue_kernel.cc index 6f4534d10ad3e..2fd5be7c47426 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_gemm_epilogue_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_gemm_epilogue_kernel.cc @@ -73,7 +73,7 @@ void FusedGemmEpilogueKernel(const Context& dev_ctx, "FusedGemm do not support batched fc now, but got batch size %d.", batch_size)); MatMulXPUFunction( - xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f, false, act); + xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f, 0.f, false, act); } } // namespace fusion diff --git a/paddle/phi/kernels/xpu/addmm_grad_kernel.cc b/paddle/phi/kernels/xpu/addmm_grad_kernel.cc new file mode 100644 index 0000000000000..c663da5a234c9 --- /dev/null +++ b/paddle/phi/kernels/xpu/addmm_grad_kernel.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
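+//
+// A sketch of the math this kernel differentiates: with
+// out = beta * input + alpha * (x @ y), the reference gradients are
+// d_x = alpha * d_out @ y^T, d_y = alpha * x^T @ d_out, and d_input
+// collects beta * d_out, reduced over the broadcast rows when input is 1-D.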
+ +#include "paddle/phi/kernels/addmm_grad_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" + +namespace phi { + +template +void AddmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + using XPUType = typename XPUTypeTrait::Type; + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + int r; + + if (input_grad) { + dev_ctx.template Alloc(input_grad); + XPUType* input_grad_ptr = reinterpret_cast(input_grad->data()); + r = xpu::constant(xpu_ctx, input_grad_ptr, input.numel(), (XPUType)(beta)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + if (input_grad->dims().size() == 1 && out_grad.dims()[0] > 1) { + r = xpu::scale(xpu_ctx, + input_grad_ptr, + input_grad_ptr, + input_grad->numel(), + true, + static_cast(out_grad.dims()[0]), + 0.f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + } + } + if (x_grad) { + dev_ctx.template Alloc(x_grad); + } + if (y_grad) { + dev_ctx.template Alloc(y_grad); + } + + const XPUType* out_grad_ptr = + reinterpret_cast(out_grad.data()); + const XPUType* x_ptr = reinterpret_cast(x.data()); + const XPUType* y_ptr = reinterpret_cast(y.data()); + + XpuFcInfo info_forward; + GetFCInfo(x.dims(), y.dims(), false, false, &info_forward); + // begin calculate + const XPUType* a_1 = nullptr; + const XPUType* b_1 = nullptr; + const XPUType* a_2 = nullptr; + const XPUType* b_2 = nullptr; + XPUType* c_1 = reinterpret_cast(x_grad->data()); + XPUType* c_2 = reinterpret_cast(y_grad->data()); + + if (x_grad && info_forward.is_x_need_broadcast) { + c_1 = RAII_GUARD.alloc_l3_or_gm(info_forward.bs * info_forward.m * + info_forward.k); + PADDLE_ENFORCE_XDNN_NOT_NULL(c_1); + } + + if (y_grad && info_forward.is_y_need_broadcast) { + c_2 = RAII_GUARD.alloc_l3_or_gm(info_forward.bs * info_forward.k * + info_forward.n); + PADDLE_ENFORCE_XDNN_NOT_NULL(c_2); + } + + XpuFcInfo info_x_grad; + XpuFcInfo info_y_grad; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + false, + false, + x_ptr, + y_ptr, + out_grad_ptr); + std::tie(info_x_grad, info_y_grad, a_1, b_1, a_2, b_2) = fc_info; + if (x_grad) { + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_x_grad, alpha, 0.f); + if (info_forward.is_x_need_broadcast) { + r = xpu::reduce_sum( + xpu_ctx, + c_1, + reinterpret_cast(x_grad->data()), + {info_forward.bs, info_forward.m, info_forward.k}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + } + } + if (y_grad) { + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_y_grad, alpha, 0.f); + if (info_forward.is_y_need_broadcast) { + r = xpu::reduce_sum( + xpu_ctx, + c_2, + reinterpret_cast(y_grad->data()), + {info_forward.bs, info_forward.k, info_forward.n}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(addmm_grad, + XPU, + ALL_LAYOUT, + phi::AddmmGradKernel, + float, + phi::dtype::bfloat16, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/addmm_kernel.cc b/paddle/phi/kernels/xpu/addmm_kernel.cc new file mode 100644 index 0000000000000..d075e4fc824e2 --- /dev/null +++ b/paddle/phi/kernels/xpu/addmm_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/addmm_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "xblas/cublasLt.h" + +#ifndef PADDLE_WITH_XPU_XRE5 +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" +#endif + +namespace xblas = baidu::xpu::xblas; + +namespace phi { + +template +void AddmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float beta, + float alpha, + DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; + + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + input_dims.size() == 2 || input_dims.size() == 1, + true, + common::errors::InvalidArgument( + "Variable 'input' of AddmmOp must be 1-dimensional or 2-dimensional, " + "but received shape: [%s]", + input_dims)); + PADDLE_ENFORCE_EQ(x_dims.size() == 2, + true, + common::errors::InvalidArgument( + "Variable 'x' of AddmmOp must be 2-dimensional, " + "but received shape: [%s]", + input_dims)); + PADDLE_ENFORCE_EQ(y_dims.size() == 2, + true, + common::errors::InvalidArgument( + "Variable 'y' of AddmmOp must be 2-dimensional, " + "but received shape: [%s]", + input_dims)); + + dev_ctx.template Alloc(out); + const XPUType* x_ptr = reinterpret_cast(x.data()); + const XPUType* y_ptr = reinterpret_cast(y.data()); + const XPUType* input_ptr = reinterpret_cast(input.data()); + XPUType* out_ptr = reinterpret_cast(out->data()); + + int r; + if (alpha == 0.f) { + if (beta == 0.f) { + r = xpu::constant(dev_ctx.x_context(), + out_ptr, + out->numel(), + static_cast(0.0f)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + } else { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + T* beta_xpu = RAII_GUARD.alloc_l3_or_gm(1); + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(beta_xpu), + out->numel(), + static_cast(beta)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + auto input_dims_vec = common::vectorize(input.dims()); + auto out_dims_vec = common::vectorize(out->dims()); + r = xpu::broadcast_mul(dev_ctx.x_context(), + input_ptr, + reinterpret_cast(beta_xpu), + out_ptr, + input_dims_vec, + out_dims_vec); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + } +#ifdef PADDLE_WITH_XPU_XRE5 + } else { + xblas::FcFusionTensor t_input{ + input_ptr, + nullptr, + input.dims()[0], + input.dims()[1], + input.dims()[1], + false, + }; + xblas::FcFusionTensor t_x{ + x_ptr, + nullptr, + x.dims()[0], + x.dims()[1], + x.dims()[1], + false, + }; + xblas::FcFusionTensor t_y{ + y_ptr, + nullptr, + y.dims()[0], + y.dims()[1], + y.dims()[1], + false, + }; + xblas::FcFusionTensor t_out{ + out_ptr, + nullptr, + out->dims()[0], + out->dims()[1], + out->dims()[1], + false, + }; + xblas::FcFusionDesc desc{ + alpha, + beta, + }; + xblas::FcFusionEpilogue epilogue{ + xdnn::Activation_t::LINEAR, + nullptr, + nullptr, + nullptr, + 0, + 0, + nullptr, + }; + r = 
xblas::fc_fusion( + dev_ctx.x_context(), t_x, t_y, t_input, t_out, desc, epilogue); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion"); +#else + } else { + Copy(dev_ctx, input, dev_ctx.GetPlace(), false, out); + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, false, false, &fc_info); + MatMulXPUFunction( + dev_ctx.x_context(), x_ptr, y_ptr, out_ptr, fc_info, alpha, beta); +#endif + } +} +} // namespace phi + +PD_REGISTER_KERNEL(addmm, + XPU, + ALL_LAYOUT, + phi::AddmmKernel, + float, + phi::dtype::bfloat16, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/fused_attention_grad_kernel.cc b/paddle/phi/kernels/xpu/fused_attention_grad_kernel.cc index fe989318cbcb4..3064a590db136 100644 --- a/paddle/phi/kernels/xpu/fused_attention_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/fused_attention_grad_kernel.cc @@ -321,10 +321,10 @@ void FusedAttentionGradKernel( std::tie(info_dfmha, info_dlinear_w, a_1, b_1, a_2, b_2) = fc_info; phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_dlinear_w, 1.0f, true); + xpu_ctx, a_2, b_2, c_2, info_dlinear_w, 1.0f, 0.f, true); phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_dfmha, 1.0f, true); + xpu_ctx, a_1, b_1, c_1, info_dfmha, 1.0f, 0.f, true); // dlinear_bias r = xpu::reduce_sum(xpu_ctx, @@ -385,9 +385,9 @@ void FusedAttentionGradKernel( std::tie(info_d_qk, info_d_v, a_1, b_1, a_2, b_2) = fc_info; phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_d_qk, 1.0f, true); + xpu_ctx, a_1, b_1, c_1, info_d_qk, 1.0f, 0.f, true); phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_d_v, 1.0f, true); + xpu_ctx, a_2, b_2, c_2, info_d_v, 1.0f, 0.f, true); DropoutGrad(xpu_ctx, d_qk_ptr, @@ -443,10 +443,10 @@ void FusedAttentionGradKernel( std::tie(info_d_q, info_d_k, a_1, b_1, a_2, b_2) = fc_info; phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_d_q, 1.0f / sqrt(head_dims), true); + xpu_ctx, a_1, b_1, c_1, info_d_q, 1.0f / sqrt(head_dims), 0.f, true); phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_d_k, 1.0f, true); + xpu_ctx, a_2, b_2, c_2, info_d_k, 1.0f, 0.f, true); } // @@ -491,9 +491,9 @@ void FusedAttentionGradKernel( std::tie(info_d_x, info_d_qkv_w, a_1, b_1, a_2, b_2) = fc_info; phi::MatMulXPUFunction( - xpu_ctx, a_1, b_1, c_1, info_d_x, 1.0f, true); + xpu_ctx, a_1, b_1, c_1, info_d_x, 1.0f, 0.f, true); phi::MatMulXPUFunction( - xpu_ctx, a_2, b_2, c_2, info_d_qkv_w, 1.0f, true); + xpu_ctx, a_2, b_2, c_2, info_d_qkv_w, 1.0f, 0.f, true); // d_qkv_bias r = xpu::reduce_sum(xpu_ctx, diff --git a/paddle/phi/kernels/xpu/xpu_api_wrapper.h b/paddle/phi/kernels/xpu/xpu_api_wrapper.h index bd3ff91b3673c..4a55be10150bf 100644 --- a/paddle/phi/kernels/xpu/xpu_api_wrapper.h +++ b/paddle/phi/kernels/xpu/xpu_api_wrapper.h @@ -170,7 +170,6 @@ static void GetFCInfo(const phi::DDim& x_dims, if (y_dims.size() >= 3 && x_dims.size() <= 2) { info->is_x_need_broadcast = (mat_dim_b.batch_size_ > 1); } - PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_, common::errors::InvalidArgument( @@ -414,7 +413,7 @@ static void xblas_fc_batch_wrapper(xpu::Context* xpu_ctx, stride_x, reinterpret_cast(w), stride_w, - 0.0, + beta, reinterpret_cast(y), stride_y, x_maxptr, @@ -434,7 +433,7 @@ static void xblas_fc_batch_wrapper(xpu::Context* xpu_ctx, stride_x, reinterpret_cast(w), stride_w, - 0.0, + beta, reinterpret_cast(y), stride_y, x_maxptr, @@ -505,6 +504,7 @@ static void MatMulXPUFunction( T* out, const XpuFcInfo& fcinfo, float alpha, + float beta = 0.f, bool is_grad = false, xpu::Activation_t act = xpu::Activation_t::LINEAR) { using XPUType = typename 
XPUTypeTrait::Type; @@ -581,7 +581,7 @@ static void MatMulXPUFunction( ldy, ldout, alpha, - 0, + beta, bias, act, scale_x, @@ -626,7 +626,7 @@ static void MatMulXPUFunction( ldx, // int stride_a, y_data, // const TW* w, ldy, // int stride_b, - 0.0, // float beta, + beta, // float beta, reinterpret_cast(out), // TY* y, ldout, // int stride_c, max_x, // const float* x_maxptr, diff --git a/test/xpu/op_test_xpu.py b/test/xpu/op_test_xpu.py index d8aae7dbe6ea1..59295d41fbd1b 100644 --- a/test/xpu/op_test_xpu.py +++ b/test/xpu/op_test_xpu.py @@ -256,7 +256,7 @@ def check_grad_with_place( a3, inputs_to_check, max_relative_error, - "Gradient Check On cpu & xpu", + "Gradient Check On xpu & cpu", ) def get_grad_with_place( diff --git a/test/xpu/test_addmm_op_xpu.py b/test/xpu/test_addmm_op_xpu.py new file mode 100644 index 0000000000000..eef876ed82953 --- /dev/null +++ b/test/xpu/test_addmm_op_xpu.py @@ -0,0 +1,296 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) +from op_test import convert_float_to_uint16, convert_uint16_to_float +from op_test_xpu import XPUOpTest + +import paddle +from paddle.base import core + + +class XPUTestAddMMOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = "addmm" + self.use_dynamic_create_class = False + + class TestAddMMOp(XPUOpTest): + """ + case 1 + """ + + def setUp(self): + self.op_type = "addmm" + self.dtype = self.in_type + self.init_case() + if self.dtype == np.uint16: + self.input_fp32 = np.random.random(self.input_shape).astype( + np.float32 + ) + self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) + self.y_fp32 = np.random.random(self.y_shape).astype(np.float32) + self.input = convert_float_to_uint16(self.input_fp32) + self.x = convert_float_to_uint16(self.x_fp32) + self.y = convert_float_to_uint16(self.y_fp32) + dot_result = np.dot(self.x_fp32, self.y_fp32) + self.outputs = { + 'Out': convert_float_to_uint16( + self.beta + * np.broadcast_to(self.input_fp32, dot_result.shape) + + self.alpha * dot_result + ) + } + else: + self.input = np.random.random(self.input_shape).astype( + self.dtype + ) + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + dot_result = np.dot(self.x, self.y) + self.outputs = { + 'Out': self.beta + * np.broadcast_to(self.input, dot_result.shape) + + self.alpha * dot_result + } + self.inputs = { + 'Input': self.input, + 'X': self.x, + 'Y': self.y, + } + self.attrs = { + 'Alpha': self.alpha, + 'Beta': self.beta, + } + + def init_case(self): + self.input_shape = [10, 10] + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.alpha = 1.0 + self.beta = 1.0 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if ( + hasattr(self.__class__, "no_need_check_grad") + 
and self.__class__.no_need_check_grad + ): + return + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + class TestAddMMOp2(TestAddMMOp): + """ + case 2 + """ + + def init_case(self): + self.input_shape = [11, 11] + self.x_shape = [11, 13] + self.y_shape = [13, 11] + self.alpha = 1.0 + self.beta = 1.0 + + class TestAddMMOp3(TestAddMMOp): + """ + case 3 + """ + + def init_case(self): + self.input_shape = [11, 11] + self.x_shape = [11, 13] + self.y_shape = [13, 11] + self.alpha = 1.0 + self.beta = 1.0 + + class TestAddMMOp4(TestAddMMOp): + """ + case 4 + """ + + def init_case(self): + self.input_shape = [11, 13] + self.x_shape = [11, 15] + self.y_shape = [15, 13] + self.alpha = 1.0 + self.beta = 1.0 + + class TestAddMMOp5(TestAddMMOp): + """ + case 5 + """ + + def init_case(self): + self.input_shape = [11, 13] + self.x_shape = [11, 15] + self.y_shape = [15, 13] + self.alpha = 0.0 + self.beta = 1.0 + + class TestAddMMOp6(TestAddMMOp): + """ + case 6 + """ + + def init_case(self): + self.input_shape = [11, 13] + self.x_shape = [11, 15] + self.y_shape = [15, 13] + self.alpha = 1.0 + self.beta = 0.0 + + class TestAddMMOp7(TestAddMMOp): + """ + case 7 + """ + + def init_case(self): + self.input_shape = [11, 13] + self.x_shape = [11, 15] + self.y_shape = [15, 13] + self.alpha = 0.0 + self.beta = 0.0 + + class TestAddmmInputGradCheck(unittest.TestCase): + def test_check_input_grad(self): + self.init_case() + + input_np = np.random.random(self.input_shape).astype(np.float32) + x_np = np.random.random(self.x_shape).astype(np.float32) + y_np = np.random.random(self.y_shape).astype(np.float32) + + input_cpu = paddle.to_tensor( + input_np, paddle.float32, paddle.CPUPlace(), stop_gradient=False + ) + x_cpu = paddle.to_tensor( + x_np, paddle.float32, paddle.CPUPlace(), stop_gradient=False + ) + y_cpu = paddle.to_tensor( + y_np, paddle.float32, paddle.CPUPlace(), stop_gradient=False + ) + out = paddle.addmm( + input_cpu, x_cpu, y_cpu, beta=self.beta, alpha=self.alpha + ) + out.backward() + + xpu_version = core.get_xpu_device_version(0) + if xpu_version == core.XPUVersion.XPU3: + test_dtypes = [paddle.float32, paddle.float16, paddle.bfloat16] + else: + test_dtypes = [paddle.float32, paddle.float16] + atol = 0.001 + rtol = 1e-5 + for test_dtype in test_dtypes: + input_xpu = paddle.to_tensor( + input_np, + test_dtype, + paddle.XPUPlace(0), + stop_gradient=False, + ) + x_xpu = paddle.to_tensor( + x_np, test_dtype, paddle.XPUPlace(0), stop_gradient=False + ) + y_xpu = paddle.to_tensor( + y_np, test_dtype, paddle.XPUPlace(0), stop_gradient=False + ) + if test_dtype == paddle.bfloat16: + input_np_bf16 = convert_float_to_uint16(input_np) + x_np_bf16 = convert_float_to_uint16(x_np) + y_np_bf16 = convert_float_to_uint16(y_np) + input_xpu = paddle.to_tensor( + input_np_bf16, + test_dtype, + paddle.XPUPlace(0), + stop_gradient=False, + ) + x_xpu = paddle.to_tensor( + x_np_bf16, + test_dtype, + paddle.XPUPlace(0), + stop_gradient=False, + ) + y_xpu = paddle.to_tensor( + y_np_bf16, + test_dtype, + paddle.device.XPUPlace(0), + stop_gradient=False, + ) + out_xpu = paddle.addmm( + input_xpu, x_xpu, y_xpu, beta=self.beta, alpha=self.alpha + ) + out_xpu.backward() + + if test_dtype == paddle.bfloat16: + np.testing.assert_allclose( + input_cpu.grad.numpy(), + convert_uint16_to_float(input_xpu.grad.numpy()), + rtol=rtol, + atol=atol, + ) + else: + np.testing.assert_allclose( + input_cpu.grad.numpy(), + input_xpu.grad.numpy(), + rtol=rtol, + atol=atol, + ) + + def init_case(self): + 
self.input_shape = [11, 11] + self.x_shape = [11, 13] + self.y_shape = [13, 11] + self.alpha = 1.0 + self.beta = 0.0 + + class TestAddmmInputGradCheck1(TestAddmmInputGradCheck): + def init_case(self): + self.input_shape = [10, 10] + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.alpha = 0.0 + self.beta = 1.0 + + class TestAddmmInputGradCheck2(TestAddmmInputGradCheck): + def init_case(self): + self.input_shape = [10, 10] + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.alpha = 0.0 + self.beta = 0.0 + + class TestAddmmInputGradCheck3(TestAddmmInputGradCheck): + def init_case(self): + self.input_shape = [10, 10] + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.alpha = 1.0 + self.beta = 1.0 + + +support_types = get_xpu_op_support_types('addmm') +for stype in support_types: + create_test_class(globals(), XPUTestAddMMOp, stype) + +if __name__ == "__main__": + unittest.main() From 0b6c47e6b59155f7e2c4e472eac35b6f6d1c46dd Mon Sep 17 00:00:00 2001 From: cubehan3 Date: Thu, 10 Oct 2024 18:47:20 +0800 Subject: [PATCH 066/135] [Prim][PIR] Polish cast from fp16 to fp32 code style (fixed) (#68575) * polish code style for half-precision cast * fixed bugs --- paddle/fluid/primitive/rule/vjp/details.h | 162 ++++++---------------- 1 file changed, 41 insertions(+), 121 deletions(-) diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 1dcb5141a8331..bedf08ff6e1c0 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -897,13 +897,10 @@ void layer_norm_grad(const Tensor& x, } // cast dtype to float32 if dtype =float16 or bfloat16 - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - x_cast = cast(x_cast, phi::DataType::FLOAT32); - out_grad_cast = cast(out_grad_cast, phi::DataType::FLOAT32); - if (scale_ptr) { - scale_cast = cast(scale_cast, phi::DataType::FLOAT32); - } + x_cast = ConverToMT(x_cast); + out_grad_cast = ConverToMT(out_grad_cast); + if (scale_ptr) { + scale_cast = ConverToMT(scale_cast); } auto x_sub_mean = x_cast - mean_; // M,N @@ -929,11 +926,8 @@ void layer_norm_grad(const Tensor& x, auto d_mean_d_std = (1.0 / shape_2) * (d_mean + d_std); auto x_grad_tmp = dx_end - d_mean_d_std; x_grad_tmp = reshape(x_grad_tmp, common::vectorize(x.dims())); + x_grad_tmp = ConverToOrig(x_grad_tmp, x.dtype()); - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - x_grad_tmp = cast(x_grad_tmp, x.dtype()); - } set_output(x_grad_tmp, x_grad); } @@ -943,10 +937,8 @@ void layer_norm_grad(const Tensor& x, (x_sub_mean_mul_sqrt_var_1 * out_grad_cast) .sum(std::vector({0}), x_cast.dtype(), true); scale_grad_tmp = reshape(scale_grad_tmp, scale_ptr->shape()); - if (scale_ptr->dtype() == phi::DataType::FLOAT16 || - scale_ptr->dtype() == phi::DataType::BFLOAT16) { - scale_grad_tmp = cast(scale_grad_tmp, scale_ptr->dtype()); - } + scale_grad_tmp = ConverToOrig(scale_grad_tmp, scale_ptr->dtype()); + set_output(scale_grad_tmp, scale_grad); } else { scale_grad = nullptr; @@ -958,10 +950,8 @@ void layer_norm_grad(const Tensor& x, auto bias_grad_tmp = out_grad_cast.sum(std::vector({0}), x_cast.dtype(), true); bias_grad_tmp = reshape(bias_grad_tmp, bias_ptr->shape()); - if (bias_ptr->dtype() == phi::DataType::FLOAT16 || - bias_ptr->dtype() == phi::DataType::BFLOAT16) { - bias_grad_tmp = cast(bias_grad_tmp, bias_ptr->dtype()); - } + bias_grad_tmp = ConverToOrig(bias_grad_tmp, bias_ptr->dtype()); + set_output(bias_grad_tmp, 
bias_grad); } else { bias_grad = nullptr; @@ -1067,15 +1057,11 @@ void square_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { template void exp_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - if (out.dtype() == phi::DataType::FLOAT16 || - out.dtype() == phi::DataType::BFLOAT16) { - Tensor out_promote = cast(out, phi::DataType::FLOAT32); - Tensor out_grad_promote = cast(out_grad, phi::DataType::FLOAT32); - set_output(cast(out_promote * out_grad_promote, out.dtype()), - x_grad); - } else { - set_output(out_grad * out, x_grad); - } + Tensor out_promote = ConverToMT(out); + Tensor out_grad_promote = ConverToMT(out_grad); + + auto x_grad_tmp = out_promote * out_grad_promote; + set_output(ConverToOrig(x_grad_tmp, out.dtype()), x_grad); } } @@ -1106,19 +1092,12 @@ void silu_grad(const Tensor& x, Tensor* x_grad) { if (x_grad) { auto one = full_scalar(1.0, x.dtype()); - auto org_dtype = x.dtype(); - bool need_cast = org_dtype == phi::DataType::FLOAT16 || - org_dtype == phi::DataType::BFLOAT16; - if (need_cast) { - auto x_cast = cast(x, phi::DataType::FLOAT32); - auto out_cast = cast(out, phi::DataType::FLOAT32); - auto out_grad_cast = cast(out_grad, phi::DataType::FLOAT32); - auto res = out_grad_cast * sigmoid(x_cast) * (one + x_cast - out_cast); - set_output(cast(res, org_dtype), x_grad); - } else { - auto res = out_grad * sigmoid(x) * (one + x - out); - set_output(res, x_grad); - } + + auto x_cast = ConverToMT(x); + auto out_cast = ConverToMT(out); + auto out_grad_cast = ConverToMT(out_grad); + auto res = out_grad_cast * sigmoid(x_cast) * (one + x_cast - out_cast); + set_output(ConverToOrig(res, x.dtype()), x_grad); } } @@ -1484,24 +1463,14 @@ void instance_norm_grad(const Tensor& x, const int h = x.dims()[2]; const int w = x.dims()[3]; - auto promoted_y_grad = y_grad; - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - promoted_y_grad = cast(y_grad, phi::DataType::FLOAT32); - } + auto promoted_y_grad = ConverToMT(y_grad); Tensor x_hat; Tensor std_inv; if (scale_grad || x_grad) { - auto promoted_x = x; - auto promoted_saved_mean = saved_mean; - auto promoted_saved_var = saved_variance; - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - promoted_x = cast(x, phi::DataType::FLOAT32); - promoted_saved_mean = cast(saved_mean, phi::DataType::FLOAT32); - promoted_saved_var = cast(saved_variance, phi::DataType::FLOAT32); - } + auto promoted_x = ConverToMT(x); + auto promoted_saved_mean = ConverToMT(saved_mean); + auto promoted_saved_var = ConverToMT(saved_variance); auto mean = reshape(promoted_saved_mean, IntArray({n, c, 1, 1})) .tile(IntArray({1, 1, h, w})); std_inv = reshape(promoted_saved_var, IntArray({n, c, 1, 1})) @@ -1517,11 +1486,7 @@ void instance_norm_grad(const Tensor& x, : full(IntArray({c}), 1., x.dtype()), IntArray({1, c, 1, 1})) .tile(IntArray({n, 1, h, w})); - auto promoted_scale = scale_data; - if (scale_data.dtype() == phi::DataType::FLOAT16 || - scale_data.dtype() == phi::DataType::BFLOAT16) { - promoted_scale = cast(scale_data, phi::DataType::FLOAT32); - } + auto promoted_scale = ConverToMT(scale_data); auto result = (promoted_scale * std_inv) * (promoted_y_grad - @@ -1530,34 +1495,19 @@ void instance_norm_grad(const Tensor& x, (x_hat * ((promoted_y_grad * x_hat) .sum(IntArray({2, 3}), promoted_y_grad.dtype(), true) / (h * w)))); - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - set_output(cast(result, x.dtype()), x_grad); 
- } else { - set_output(result, x_grad); - } + set_output(ConverToOrig(result, x.dtype()), x_grad); } // scale_grad = x_hat * y_grad.sum(n, h, w) if (scale_grad) { auto result = (promoted_y_grad * x_hat).sum(IntArray({0, 2, 3})); auto scale_dtype = scale.get_ptr() ? scale.get().dtype() : x.dtype(); - if (scale_dtype == phi::DataType::FLOAT16 || - scale_dtype == phi::DataType::BFLOAT16) { - set_output(cast(result, scale_dtype), scale_grad); - } else { - set_output(result, scale_grad); - } + set_output(ConverToOrig(result, scale_dtype), scale_grad); } // d_bias = y_grad.sum(n, h, w) if (bias_grad) { auto result = promoted_y_grad.sum(IntArray({0, 2, 3})); auto scale_dtype = scale.get_ptr() ? scale.get().dtype() : x.dtype(); - if (scale_dtype == phi::DataType::FLOAT16 || - scale_dtype == phi::DataType::BFLOAT16) { - set_output(cast(result, scale_dtype), bias_grad); - } else { - set_output(result, bias_grad); - } + set_output(ConverToOrig(result, scale_dtype), bias_grad); } } @@ -1895,8 +1845,8 @@ void batch_norm_grad(const Tensor& x, DataLayout data_layout_ = common::StringToDataLayout(data_layout); - Tensor x_data = x; - Tensor out_grad_data = out_grad; + Tensor x_data = ConverToMT(x); + Tensor out_grad_data = ConverToMT(out_grad); auto run_var = variance_out.get(); auto run_mean = mean_out.get(); Tensor mean_data; @@ -1905,15 +1855,6 @@ void batch_norm_grad(const Tensor& x, std::vector nhwc_to_nchw_dim = {0, 3, 1, 2}; auto reduce_axis = IntArray(std::vector{0, 1, 2}); - bool need_cast = x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16; - if (need_cast) { - x_data = cast(x, phi::DataType::FLOAT32); - } - if (out_grad.dtype() == phi::DataType::FLOAT16 || - out_grad.dtype() == phi::DataType::BFLOAT16) { - out_grad_data = cast(out_grad, phi::DataType::FLOAT32); - } if (x_data.dims().size() == 2 && data_layout_ == DataLayout::kNCHW) { data_layout_ = DataLayout::kNHWC; } @@ -1966,9 +1907,8 @@ void batch_norm_grad(const Tensor& x, nhwc_x_grad = scale.get() * nhwc_x_grad; } auto nchw_x_grad = transpose(nhwc_x_grad, nhwc_to_nchw_dim); - if (need_cast) { - nchw_x_grad = cast(nchw_x_grad, x.dtype()); - } + nchw_x_grad = ConverToOrig(nchw_x_grad, x.dtype()); + set_output(nchw_x_grad, x_grad); } else { auto part1 = rsqrt_var; @@ -2009,9 +1949,7 @@ void batch_norm_grad(const Tensor& x, auto x_grad_data = part1 * part2; auto nchw_x_grad = transpose(x_grad_data, nhwc_to_nchw_dim); - if (need_cast) { - nchw_x_grad = cast(nchw_x_grad, x.dtype()); - } + nchw_x_grad = ConverToOrig(nchw_x_grad, x.dtype()); set_output(nchw_x_grad, x_grad); } } @@ -2035,9 +1973,7 @@ void batch_norm_grad(const Tensor& x, if (scale) { x_grad_data = scale.get() * x_grad_data; } - if (need_cast) { - x_grad_data = cast(x_grad_data, x.dtype()); - } + x_grad_data = ConverToOrig(x_grad_data, x.dtype()); set_output(x_grad_data, x_grad); } else { auto part1 = rsqrt_var; @@ -2078,9 +2014,8 @@ void batch_norm_grad(const Tensor& x, out_grad_data - mean_temp1 - (x_data - mean_data) * mean_temp2; auto x_grad_data = part1 * part2; - if (need_cast) { - x_grad_data = cast(x_grad_data, x.dtype()); - } + x_grad_data = ConverToOrig(x_grad_data, x.dtype()); + set_output(x_grad_data, x_grad); } if (scale_grad) { @@ -2422,18 +2357,8 @@ void group_norm_grad(const Tensor& x, int g_num = C / groups; - Tensor x_data = x; - Tensor out_grad_data = out_grad; - - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - x_data = cast(x, phi::DataType::FLOAT32); - } - - if (out_grad.dtype() == 
phi::DataType::FLOAT16 || - out_grad.dtype() == phi::DataType::BFLOAT16) { - out_grad_data = cast(out_grad, phi::DataType::FLOAT32); - } + Tensor x_data = ConverToMT(x); + Tensor out_grad_data = ConverToMT(out_grad); auto shape_group = std::vector({N, groups, g_num}); @@ -2467,10 +2392,8 @@ void group_norm_grad(const Tensor& x, Tensor d2; Tensor p1; if (scale) { - if (scale_data.dtype() == phi::DataType::FLOAT16 || - scale_data.dtype() == phi::DataType::BFLOAT16) { - scale_data = cast(scale_data, phi::DataType::FLOAT32); - } + scale_data = ConverToMT(scale_data); + d1 = (reshape(sum_y_grad_mul_x * scale_data, shape_group)) .sum(std::vector({2}), dtype, false); d2 = (reshape(sum_y_grad * scale_data, shape_group)) @@ -2504,10 +2427,7 @@ void group_norm_grad(const Tensor& x, auto tmp_2 = reshape(x_data, whole_group_shape) * p2 + p3; auto x_grad_data = tmp_1 + tmp_2; x_grad_data = reshape(x_grad_data, x.shape()); - if (x.dtype() == phi::DataType::FLOAT16 || - x.dtype() == phi::DataType::BFLOAT16) { - x_grad_data = cast(x_grad_data, x.dtype()); - } + x_grad_data = ConverToOrig(x_grad_data, x.dtype()); set_output(x_grad_data, x_grad); } From cbc9380c7180b6e501eba6e25d8d47ae8752ec8f Mon Sep 17 00:00:00 2001 From: Zhaowu Pan Date: Thu, 10 Oct 2024 21:55:32 +0800 Subject: [PATCH 067/135] [PHI] Fix performance issue in bilinear interpolation's backward kernel. (#68541) * [PHI] Fix performance issue in bilinear interpolation's backward kernel. * Fix CPPLINT mis-warning issue. * Fixed implicit constant datatype issue. --- .../kernels/gpu/interpolate_grad_kernel.cu | 152 +++++++++++++----- 1 file changed, 114 insertions(+), 38 deletions(-) diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index ebac0069eebaf..c7eb0a55eca65 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -35,12 +35,13 @@ __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( T* lambda2, T src_x, const int in_img_x) { - src_x = (src_x > static_cast(0)) ? src_x : static_cast(0); - *in_img_idx = static_cast(src_x); - *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; - using MT = typename phi::dtype::MPTypeTrait::Type; - *lambda1 = static_cast(static_cast(src_x) - *in_img_idx); - *lambda2 = static_cast(1.0) - *lambda1; + src_x = max(src_x, static_cast(0)); + T src_x_floor = floorf(src_x); + T frac_part = src_x - src_x_floor; + *lambda1 = frac_part; + *lambda2 = static_cast(1) - frac_part; + *in_img_idx = static_cast(src_x_floor); + *x_id = (*in_img_idx < in_img_x - 1); } template @@ -360,42 +361,117 @@ __global__ void KeBilinearInterpNCHWBw(T* in, const T* __restrict__ out, const float align_type_value) { int index = threadIdx.x + blockDim.x * blockIdx.x; - int stride = blockDim.x * gridDim.x; - int num_out = n * num_channels * out_h * out_w; - int num_in = n * num_channels * in_h * in_w; + const int stride = blockDim.x * gridDim.x; + const int num_out = n * num_channels * out_h * out_w; + const int num_in = n * num_channels * in_h * in_w; using MT = typename phi::dtype::MPTypeTrait::Type; - for (; index < num_out; index += stride) { - int index_tmp = index; - int w2 = index_tmp % out_w; - index_tmp /= out_w; - int h2 = index_tmp % out_h; - int nc = index_tmp / out_h; + // Restricted parallelism if ratio_w is over threshold + // to avoid atomic contention overhead. 
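+ // In effect the kernel switches from scatter (one atomicAdd per output
+ // pixel) to gather (each thread privately accumulates the full sum for
+ // one input pixel), trading extra index arithmetic for atomic-free writes.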
+ // This threshold 0.5f was determined through extensive quantitative analysis,
+ // corresponding to a 2x or larger scale factor along the W axis.
+ if (ratio_w < 0.5f) [[likely]] { // NOLINT
+ if (index < num_in) {
+ int index_tmp = index;
+ const int w1 = index_tmp % in_w;
+ index_tmp /= in_w;
+ const int h1 = index_tmp % in_h;
+ const int nc = index_tmp / in_h;
+
+ MT d2val_sum = 0.0f;
+
+ // Precompute constants
+ const MT inv_ratio_h = 1.0f / ratio_h;
+ const MT inv_ratio_w = 1.0f / ratio_w;
+
+ // Compute the range of output pixels (h2_min, h2_max) that could affect
+ // input pixel h1
+ const MT h2r_min =
+ (h1 - 1 + align_type_value) * inv_ratio_h - align_type_value;
+ const int h2_min = max(static_cast(ceilf(h2r_min)), 0);
+
+ const MT h2r_max =
+ (h1 + 1 + align_type_value) * inv_ratio_h - align_type_value;
+ const int h2_max = min(static_cast(floorf(h2r_max)), out_h - 1);
+
+ // Compute the range of output pixels (w2_min, w2_max) that could affect
+ // input pixel w1
+ const MT w2r_min =
+ (w1 - 1 + align_type_value) * inv_ratio_w - align_type_value;
+ const int w2_min = max(static_cast(ceilf(w2r_min)), 0);
+
+ const MT w2r_max =
+ (w1 + 1 + align_type_value) * inv_ratio_w - align_type_value;
+ const int w2_max = min(static_cast(floorf(w2r_max)), out_w - 1);
+
+ for (int h2 = h2_min; h2 <= h2_max; ++h2) {
+ const MT src_y = ratio_h * (h2 + align_type_value) - align_type_value;
+ int h1_, y_id;
+ MT h1lambda, h0lambda;
+ PreCalculatorForLinearInterpInputIndex(
+ &h1_, &y_id, &h1lambda, &h0lambda, src_y, in_h);
+
+ if (h1 != h1_ && h1 != h1_ + y_id) [[unlikely]] {
+ continue;
+ }
+
+ for (int w2 = w2_min; w2 <= w2_max; ++w2) {
+ int w1_, x_id;
+ const MT src_x = ratio_w * (w2 + align_type_value) - align_type_value;
+ MT w1lambda, w0lambda;
+ PreCalculatorForLinearInterpInputIndex(
+ &w1_, &x_id, &w1lambda, &w0lambda, src_x, in_w);
+ if (w1 != w1_ && w1 != w1_ + x_id) [[unlikely]] {
+ continue;
+ }
+
+ const MT grad_output = out[nc * out_h * out_w + h2 * out_w + w2];
+
+ float hlambda = (h1 == h1_) ? h0lambda : 0.0f;
+ hlambda += (h1 == h1_ + y_id) ? h1lambda : 0.0f;
+
+ float wlambda = (w1 == w1_) ? w0lambda : 0.0f;
+ wlambda += (w1 == w1_ + x_id) ?
w1lambda : 0.0f; + + d2val_sum += hlambda * wlambda * grad_output; + } + } + in[index] = static_cast(d2val_sum); + } + } else [[unlikely]] { // NOLINT + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + MT h1lambda, h0lambda; + MT src_y = + static_cast(ratio_h * (h2 + align_type_value) - align_type_value); + + PreCalculatorForLinearInterpInputIndex( + &h1, &y_id, &h1lambda, &h0lambda, src_y, in_h); + int w1, x_id; + MT w1lambda, w0lambda; + MT src_x = + static_cast(ratio_w * (w2 + align_type_value) - align_type_value); + PreCalculatorForLinearInterpInputIndex( + &w1, &x_id, &w1lambda, &w0lambda, src_x, in_w); + + MT d2val = static_cast(out[index]); + + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + static_cast(h0lambda * w0lambda * d2val)); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + static_cast(h0lambda * w1lambda * d2val)); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + static_cast(h1lambda * w0lambda * d2val)); + phi::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + static_cast(h1lambda * w1lambda * d2val)); + } } } From 3865ce6ea3a3e3da9937881bd3d89d8f2bab854d Mon Sep 17 00:00:00 2001 From: megemini Date: Fri, 11 Oct 2024 00:07:19 +0800 Subject: [PATCH 068/135] =?UTF-8?q?[Typing]=20=E7=94=9F=E6=88=90=20`tensor?= =?UTF-8?q?.pyi`=20=E6=97=B6=E6=B7=BB=E5=8A=A0=20`overload`=20=E7=9A=84?= =?UTF-8?q?=E6=96=B9=E6=B3=95=20(#68598)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../base/dygraph/tensor_patch_methods.py | 3 +- python/paddle/tensor/linalg.py | 4 +- python/paddle/tensor/manipulation.py | 3 +- python/paddle/tensor/search.py | 3 +- python/paddle/tensor/stat.py | 4 +- tools/gen_tensor_stub.py | 40 +++++++++++++++---- 6 files changed, 43 insertions(+), 14 deletions(-) diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index 4032597e82f07..2e4c75c093911 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -17,10 +17,11 @@ import hashlib import inspect import warnings -from typing import TYPE_CHECKING, Any, Callable, overload +from typing import TYPE_CHECKING, Any, Callable import numpy as np import numpy.typing as npt +from typing_extensions import overload import paddle from paddle import _C_ops, profiler diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index a09a6b6c7b2ce..669feb60a6096 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -13,10 +13,10 @@ # limitations under the License. 
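+# Note: `overload` is imported from typing_extensions (not typing) so that
+# typing_extensions.get_overloads(), used by tools/gen_tensor_stub.py, can
+# discover the registered overloads on Python versions below 3.11.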
from __future__ import annotations -from typing import TYPE_CHECKING, Literal, overload +from typing import TYPE_CHECKING, Literal import numpy as np -from typing_extensions import TypeAlias +from typing_extensions import TypeAlias, overload import paddle from paddle import _C_ops diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index ed6390727b8a0..5e77930b02669 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -16,9 +16,10 @@ import functools import math -from typing import TYPE_CHECKING, Any, Literal, overload +from typing import TYPE_CHECKING, Any, Literal import numpy as np +from typing_extensions import overload import paddle from paddle import _C_ops diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index c9a9e065d97ca..58a20c37661cf 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -14,9 +14,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, overload +from typing import TYPE_CHECKING, Literal import numpy as np +from typing_extensions import overload import paddle from paddle import _C_ops diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index cc01f2f3f3d08..1580a28b20ea0 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -14,9 +14,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, overload +from typing import TYPE_CHECKING, Literal -from typing_extensions import TypeAlias +from typing_extensions import TypeAlias, overload import paddle from paddle import _C_ops diff --git a/tools/gen_tensor_stub.py b/tools/gen_tensor_stub.py index 6a25f136507de..8d55efb1f34e8 100644 --- a/tools/gen_tensor_stub.py +++ b/tools/gen_tensor_stub.py @@ -25,7 +25,7 @@ from functools import cached_property, lru_cache from typing import TYPE_CHECKING, Any, Callable, Literal, Protocol -from typing_extensions import TypeAlias +from typing_extensions import TypeAlias, get_overloads if TYPE_CHECKING: from types import ModuleType @@ -221,7 +221,9 @@ def add_method(self, func: Member): method_code += f"@{decorator}\n" method_code += f"def {func.signature}:\n" - if func.doc: + # do NOT insert docs from overload methods, + # because we always add a plain method + if func.doc and func.decorators != ["overload"]: method_code += f'{INDENT}r"""\n' method_code += with_indent(func.doc, 1) method_code += "\n" @@ -506,11 +508,34 @@ def get_tensor_members(module: str = 'paddle.Tensor') -> dict[int, Member]: member_signature, member_doc_cleaned, ) - elif ( - inspect.isfunction(member) - or inspect.ismethod(member) - or inspect.ismethoddescriptor(member) - ): + elif inspect.isfunction(member) or inspect.ismethod(member): + # `all_signatures`: list[[member id, decorators, signature]] + # with atleast an original method + all_signatures = [[member_id, [], member_signature]] + + # try to get overloads + _overloads = get_overloads(member) + for f in _overloads: + _sig = inspect.signature(f) + all_signatures.append( + [ + id(f), + ["overload"], + f"{name}{_sig}".replace("Ellipsis", "..."), + ] + ) + + for _member_id, _decorators, _sig in all_signatures: + members[_member_id] = Member( + _member_id, + name, + "method", + [], + _decorators, + func_sig_to_method_sig(_sig), + member_doc_cleaned, + ) + elif inspect.ismethoddescriptor(member): members[member_id] = Member( member_id, name, @@ -522,6 +547,7 @@ def get_tensor_members(module: str = 'paddle.Tensor') -> dict[int, 
Member]: ) else: logging.debug(f"Skip unknown type of member: {name}, {member}") + return members From d8211352fedb0e80a490cc430893a67c323d5564 Mon Sep 17 00:00:00 2001 From: crazyxiaoxi <113622186+crazyxiaoxi@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:12:54 +0800 Subject: [PATCH 069/135] =?UTF-8?q?[CINN]=20=E3=80=90Infer=20Symbolic=20Sh?= =?UTF-8?q?ape=20BUAA=20=E3=80=91Add=20fused=5Ffeedfoward=20op=20=20(#6818?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * first first * fix F f * fix * isFaleValue * is fakevalue * comment --- .../multiary_infer_sym.cc | 127 +++++++++++++++++- .../infer_symbolic_shape/multiary_infer_sym.h | 2 +- .../phi/ops/yaml/inconsistent/static_ops.yaml | 2 +- 3 files changed, 123 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 748fa8a252ed9..c9138af11ff60 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -1542,12 +1542,127 @@ bool FlashAttnOpInferSymbolicShape( return true; } -// bool FusedFeedforwardOpInferSymbolicShape(pir::Operation *op, -// pir::InferSymbolicShapeContext -// *infer_context) { -// // pass -// return true; -// } +bool FusedFeedforwardOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const auto &x_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(0)); + std::vector x_shape = x_shape_or_data.shape(); + + auto RowMatrixFromVector = [](const std::vector &x_shape) + -> std::vector { + if (x_shape.size() > 1) { + return x_shape; + } + return {symbol::DimExpr(1), x_shape[0]}; + }; + std::vector tensor_dim = RowMatrixFromVector(x_shape); + symbol::DimExpr height_; + symbol::DimExpr width_; + symbol::DimExpr stride_; + symbol::DimExpr batch_size_ = 1; + + if (tensor_dim.size() == 2) { + height_ = tensor_dim[0]; + width_ = tensor_dim[1]; + } else { + std::vector dim_vec = tensor_dim; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + batch_size_ = batch_size_ * dim_vec[i]; + } + height_ = dim_vec[dim_vec.size() - 2]; + width_ = dim_vec[dim_vec.size() - 1]; + stride_ = height_ * width_; + } + + const auto &linear1_weight_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(3)); + std::vector linear1_weight_dims = + linear1_weight_shape_or_data.shape(); + + std::vector tmp_dim_x = x_shape; + tmp_dim_x.back() = linear1_weight_dims.back(); + + infer_context->SetShapeOrDataForValue( + op->result(0), + symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(x_shape)}); + + bool is_test = op->attribute("is_test").data(); + + if (!is_test) { + infer_context->SetShapeOrDataForValue( + op->result(1), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(tmp_dim_x)}); + } + infer_context->SetShapeOrDataForValue( + op->result(9), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(tmp_dim_x)}); + infer_context->SetShapeOrDataForValue( + op->result(7), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(tmp_dim_x)}); + infer_context->SetShapeOrDataForValue( + op->result(10), + symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(x_shape)}); + + if (!is_test) { + infer_context->SetShapeOrDataForValue( + op->result(2), + 
symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(x_shape)}); + } + + std::vector mean_dim = + std::vector{batch_size_ * height_}; + + bool pre_layer_norm = + op->attribute("pre_layer_norm").data(); + + if (pre_layer_norm) { + if (!paddle::dialect::details::IsFakeValue(op->result(8))) { + infer_context->SetShapeOrDataForValue( + op->result(8), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(x_shape)}); + } + + if (!paddle::dialect::details::IsFakeValue(op->result(3))) { + infer_context->SetShapeOrDataForValue( + op->result(3), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(mean_dim)}); + } + + if (!paddle::dialect::details::IsFakeValue(op->result(4))) { + infer_context->SetShapeOrDataForValue( + op->result(4), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(mean_dim)}); + } + + } else { + if (!paddle::dialect::details::IsFakeValue(op->result(5))) { + infer_context->SetShapeOrDataForValue( + op->result(5), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(mean_dim)}); + } + + if (!paddle::dialect::details::IsFakeValue(op->result(6))) { + infer_context->SetShapeOrDataForValue( + op->result(6), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(mean_dim)}); + } + } + + infer_context->SetShapeOrDataForValue( + op->result(0), + symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(x_shape)}); + + return true; +} // bool FusedAttentionOpInferSymbolicShape(pir::Operation *op, // pir::InferSymbolicShapeContext diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index a67cf5cbf1551..28e9e1861ed31 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -53,7 +53,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(FakeQuantizeRangeAbsMax) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FakeQuantizeRangeAbsMax_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FullWithTensor) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttn) -// OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedFeedforward) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedFeedforward) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedAttention) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnQkvpacked) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnUnpadded) diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index b427ae205d970..639f904516963 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -1048,7 +1048,7 @@ func: FusedFeedForwardInferMeta optional: dropout1_seed, dropout2_seed, linear1_bias, linear2_bias, ln1_scale, ln1_bias, ln2_scale, ln2_bias, ln2_mean, ln2_variance, ln1_mean, ln1_variance, ln1_out backward: fused_feedforward_grad - # interfaces : paddle::dialect::InferSymbolicShapeInterface + interfaces : paddle::dialect::InferSymbolicShapeInterface - op: moving_average_abs_max_scale args: (Tensor x, Tensor in_accum, Tensor in_state, float moving_rate=0.9f, bool is_test=false) From 679a1f0ecc33240145735973c32e69877da9e786 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:23:17 +0800 Subject: [PATCH 070/135] =?UTF-8?q?=E3=80=90CINN=E3=80=91Add=20IndexExpr?= =?UTF-8?q?=20(#68617)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit * add index_expr * remove IterExpr. --- paddle/cinn/ir/ir.cc | 111 +++++++++++++++++++++++ paddle/cinn/ir/ir.h | 26 ++++++ paddle/cinn/ir/ir_base.cc | 35 +++++++ paddle/cinn/ir/ir_base.h | 6 ++ paddle/cinn/ir/ir_visitor.cc | 7 ++ paddle/cinn/ir/ir_visitor.h | 3 + test/cpp/pir/cinn/adt/CMakeLists.txt | 1 + test/cpp/pir/cinn/adt/Index_expr_test.cc | 43 +++++++++ 8 files changed, 232 insertions(+) create mode 100644 test/cpp/pir/cinn/adt/Index_expr_test.cc diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index c721838dacf6a..94158699ae93a 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -15,6 +15,7 @@ #include "paddle/cinn/ir/ir.h" #include +#include #include #include #include "paddle/cinn/common/cinn_value.h" @@ -1516,5 +1517,115 @@ void Block::Verify() const {} void PrimitiveNode::Verify() const {} +IndexExpr &IndexExpr::operator=(const IndexExpr &other) { + *static_cast(this) = *static_cast(&other); + return *this; +} + +static std::optional SimplifyAdd(IndexExpr lhs, IndexExpr rhs) { + auto lhsConst = lhs.As(); + auto rhsConst = rhs.As(); + if (lhsConst && rhsConst) { + return IndexExpr(lhsConst->value + rhsConst->value); + } + + if (lhsConst && !rhsConst) { + return rhs + lhs; + } + if (rhsConst && rhsConst->value == 0) { + return lhs; + } + + return std::nullopt; +} +static std::optional SimplifySub(IndexExpr lhs, IndexExpr rhs) { + auto lhsConst = lhs.As(); + auto rhsConst = rhs.As(); + + if (lhsConst && rhsConst) { + return IndexExpr(lhsConst->value - rhsConst->value); + } + + if (rhsConst && rhsConst->value == 0) { + return lhs; + } + + return std::nullopt; +} + +static std::optional SimplifyMul(IndexExpr lhs, IndexExpr rhs) { + auto lhsConst = lhs.As(); + auto rhsConst = rhs.As(); + + if (lhsConst && rhsConst) { + return IndexExpr(lhsConst->value * rhsConst->value); + } + + if (lhsConst && !rhsConst) { + return rhs * lhs; + } + + if (rhsConst) { + if (rhsConst->value == 0) { + return IndexExpr(0); + } + if (rhsConst->value == 1) { + return lhs; + } + } + + return std::nullopt; +} + +static std::optional SimplifyDiv(IndexExpr lhs, IndexExpr rhs) { + auto lhsConst = lhs.As(); + auto rhsConst = rhs.As(); + + if (lhsConst && rhsConst) { + return IndexExpr(lhsConst->value / rhsConst->value); + } + + if (rhsConst && rhsConst->value == 1) { + return lhs; + } + + return std::nullopt; +} + +static std::optional SimplifyMod(IndexExpr lhs, IndexExpr rhs) { + auto lhsConst = lhs.As(); + auto rhsConst = rhs.As(); + + if (lhsConst && rhsConst) { + return IndexExpr(lhsConst->value % rhsConst->value); + } + + if (rhsConst && rhsConst->value == 1) { + return IndexExpr(0); + } + + return std::nullopt; +} + +#define DEFINE_BINARY_OPERATOR(op, simplifyFunc, makeFunc) \ + IndexExpr IndexExpr::operator op(int64_t v) const { \ + return *this op IndexExpr(v); \ + } \ + IndexExpr IndexExpr::operator op(int32_t v) const { \ + return *this op IndexExpr(v); \ + } \ + IndexExpr IndexExpr::operator op(IndexExpr other) const { \ + if (auto simplified = simplifyFunc(*this, other)) \ + return simplified.value(); \ + return makeFunc(*this, other); \ + } + +DEFINE_BINARY_OPERATOR(+, SimplifyAdd, Add::Make) +DEFINE_BINARY_OPERATOR(-, SimplifySub, Sub::Make) +DEFINE_BINARY_OPERATOR(*, SimplifyMul, Mul::Make) +DEFINE_BINARY_OPERATOR(/, SimplifyDiv, Div::Make) +DEFINE_BINARY_OPERATOR(%, SimplifyMod, Mod::Make) + +#undef DEFINE_BINARY_OPERATOR } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 
a7eaaf0212a00..96d5225726c1e 100644
--- a/paddle/cinn/ir/ir.h
+++ b/paddle/cinn/ir/ir.h
@@ -1012,6 +1012,32 @@ struct Block : public ExprNode {
   static const IrNodeTy _node_type_ = IrNodeTy::Block;
 };
 
+struct IndexExpr : public Expr {
+ public:
+  IndexExpr() = default;
+  IndexExpr(const IndexExpr& other) : Expr(other.ptr()) {}
+  IndexExpr(IrNode* p) : Expr(p) {}            // NOLINT
+  IndexExpr(const Expr& expr) : Expr(expr) {}  // NOLINT
+
+  explicit IndexExpr(int32_t x) : Expr(x) {}
+  explicit IndexExpr(int64_t x) : Expr(x) {}
+
+  IndexExpr& operator=(const IndexExpr& other);
+
+#define DEFINE_OPERATOR(op)                  \
+  IndexExpr operator op(int64_t v) const;    \
+  IndexExpr operator op(int32_t v) const;    \
+  IndexExpr operator op(IndexExpr other) const;
+
+  DEFINE_OPERATOR(+)
+  DEFINE_OPERATOR(-)
+  DEFINE_OPERATOR(*)
+  DEFINE_OPERATOR(/)
+  DEFINE_OPERATOR(%)
+
+#undef DEFINE_OPERATOR
+};
+
 /**
  * \brief IterMark is a special ExprNode, which can be used to mark the entire
  * iterator. source is an IterSum or iterator. extent is the extent of the
diff --git a/paddle/cinn/ir/ir_base.cc b/paddle/cinn/ir/ir_base.cc
index ca387e9e53643..1bc1f26058d19 100644
--- a/paddle/cinn/ir/ir_base.cc
+++ b/paddle/cinn/ir/ir_base.cc
@@ -247,6 +247,41 @@ double Expr::get_constant() const {
 
 bool Expr::is_var() const { return As<_Var_>(); }
 
+bool Expr::is_index() const {
+  switch (node_type()) {
+    case ir::IrNodeTy::_Var_:
+      return true;
+    case ir::IrNodeTy::IntImm: {
+      if (type().is_index_type()) return true;
+    }
+    case ir::IrNodeTy::Add:
+      [[fallthrough]];
+    case ir::IrNodeTy::Sub:
+      [[fallthrough]];
+    case ir::IrNodeTy::Mul:
+      [[fallthrough]];
+    case ir::IrNodeTy::Div:
+      [[fallthrough]];
+    case ir::IrNodeTy::Mod:
+      return p_->operand(0).is_index() && p_->operand(1).is_index();
+  }
+  return false;
+}
+
+const IndexExpr Expr::as_index() const {
+  if (is_index()) {
+    return IndexExpr(*this);
+  }
+  PADDLE_THROW(::common::errors::InvalidType("Expr is not IndexExpr!"));
+}
+
+IndexExpr Expr::as_index() {
+  if (is_index()) {
+    return IndexExpr(*this);
+  }
+  PADDLE_THROW(::common::errors::InvalidType("Expr is not IndexExpr!"));
+}
+
 _Buffer_ *Expr::as_buffer() { return As<_Buffer_>(); }
 const _Buffer_ *Expr::as_buffer() const { return As<_Buffer_>(); }
 Buffer Expr::as_buffer_ref() const { return Buffer(&Reference(as_buffer())); }
diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h
index b7d095979179a..48ba075929b2c 100644
--- a/paddle/cinn/ir/ir_base.h
+++ b/paddle/cinn/ir/ir_base.h
@@ -347,6 +347,7 @@ struct StringImm : public ExprNode {
 };
 
 class Var;
+class IndexExpr;
 
 /**
  * An expression that represents some value or the result of some operations.
 */
@@ -433,6 +434,11 @@ struct Expr : public IrNodeRef {
 
   bool is_var() const;
 
+  bool is_index() const;
+
+  IndexExpr as_index();
+  const IndexExpr as_index() const;
+
   operator Var();
 
   Type type() const { return p_->type(); }
diff --git a/paddle/cinn/ir/ir_visitor.cc b/paddle/cinn/ir/ir_visitor.cc
index 14542b6a9915e..83babfb60c379 100644
--- a/paddle/cinn/ir/ir_visitor.cc
+++ b/paddle/cinn/ir/ir_visitor.cc
@@ -28,5 +28,12 @@ bool operator==(Expr a, Expr b) {
 
 bool operator!=(Expr a, Expr b) { return !(a == b); }
 
+bool operator==(IndexExpr a, IndexExpr b) {
+  if (a.get() == b.get()) return true;
+  return ir_utils::IRCompare(a, b);
+}
+
+bool operator!=(IndexExpr a, IndexExpr b) { return !(a == b); }
+
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/ir_visitor.h b/paddle/cinn/ir/ir_visitor.h
index 8a4e722e4b0cd..2dd8b4d6ee15e 100644
--- a/paddle/cinn/ir/ir_visitor.h
+++ b/paddle/cinn/ir/ir_visitor.h
@@ -102,5 +102,8 @@ struct IRVisitor : public IRVisitorRequireReImpl<void> {
 bool operator==(Expr a, Expr b);
 bool operator!=(Expr a, Expr b);
 
+bool operator==(IndexExpr a, IndexExpr b);
+bool operator!=(IndexExpr a, IndexExpr b);
+
 }  // namespace ir
 }  // namespace cinn
diff --git a/test/cpp/pir/cinn/adt/CMakeLists.txt b/test/cpp/pir/cinn/adt/CMakeLists.txt
index 818b2c0c1b186..49d5c294663ca 100644
--- a/test/cpp/pir/cinn/adt/CMakeLists.txt
+++ b/test/cpp/pir/cinn/adt/CMakeLists.txt
@@ -1,6 +1,7 @@
 if(WITH_TESTING AND WITH_CINN)
   paddle_test(map_expr_test SRCS map_expr_test.cc)
   set_tests_properties(map_expr_test PROPERTIES LABELS "RUN_TYPE=CINN")
+  paddle_test(test_index_expr SRCS Index_expr_test.cc)
   paddle_test(test_iter_simplify SRCS iter_simplify_test.cc)
   paddle_test(merge_block_utils_test SRCS merge_block_utils_test.cc)
 endif()
diff --git a/test/cpp/pir/cinn/adt/Index_expr_test.cc b/test/cpp/pir/cinn/adt/Index_expr_test.cc
new file mode 100644
index 0000000000000..02f682aa1140a
--- /dev/null
+++ b/test/cpp/pir/cinn/adt/Index_expr_test.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2024 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
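+//
+// A minimal sketch of what the checks below rely on (see the operator
+// overloads added to ir.cc in this patch): a binary op over two constant
+// IndexExprs is folded eagerly, e.g. IndexExpr(14) + IndexExpr(7) yields
+// Expr(21), and identities such as x + 0, x * 1 and x % 1 short-circuit;
+// anything else falls back to Add::Make / Sub::Make / etc.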
+ +#include +#include +#include "paddle/cinn/ir/op/ir_operators.h" + +namespace cinn { +namespace common { +TEST(IndexExpr, IndexExpr_0) { + ir::IndexExpr a(14); + ir::IndexExpr b(7); + Expr d(6); + ir::Expr c0 = a + b; + ir::Expr c1 = a - b; + ir::Expr c2 = a * b; + ir::Expr c3 = a / b; + ir::Expr c4 = a % b; + + ir::Expr c5 = a / d.as_index(); + ir::Expr c6 = a % d.as_index(); + + EXPECT_EQ(c0, Expr(21)); + EXPECT_EQ(c1, Expr(7)); + EXPECT_EQ(c2, Expr(98)); + EXPECT_EQ(c3, Expr(2)); + EXPECT_EQ(c4, Expr(0)); + EXPECT_EQ(c5, Expr(2)); + EXPECT_EQ(c6, Expr(2)); +} +} // namespace common +} // namespace cinn From e898b190c8a8060d81f1bb745a35b9b7eb88cc94 Mon Sep 17 00:00:00 2001 From: Miao Zhong <156628066+Micalling@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:25:40 +0800 Subject: [PATCH 071/135] =?UTF-8?q?=E3=80=90Hackathon=207th=20No.23?= =?UTF-8?q?=E3=80=91NO.23=20=E4=B8=BA=20Paddle=20=E6=96=B0=E5=A2=9E=20Para?= =?UTF-8?q?meterDict=20API=20-part=20(#68270)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/nn/__init__.py | 9 +- python/paddle/nn/layer/container.py | 103 ++++++++++++++++++ ...test_imperative_container_parameterdict.py | 90 +++++++++++++++ 3 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 test/legacy_test/test_imperative_container_parameterdict.py diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 30e3718fd1163..13343dc51dbe5 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -77,7 +77,13 @@ # TODO: import all neural network related api under this directory, # including layers, linear, conv, rnn etc. -from .layer.container import LayerDict, LayerList, ParameterList, Sequential +from .layer.container import ( + LayerDict, + LayerList, + ParameterDict, + ParameterList, + Sequential, +) from .layer.conv import ( Conv1D, Conv1DTranspose, @@ -243,6 +249,7 @@ 'TransformerEncoder', 'Softmax', 'Softmax2D', + 'ParameterDict', 'ParameterList', 'Conv2D', 'Softshrink', diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py index 9312b4014160c..e4438f3af2915 100644 --- a/python/paddle/nn/layer/container.py +++ b/python/paddle/nn/layer/container.py @@ -319,6 +319,109 @@ def update( self.add_sublayer(kv[0], kv[1]) +class ParameterDict(Layer): + """ + Holds parameters in a dictionary. + + ParameterDict can be indexed like a regular Python dictionary, but Parameters it contains are properly registered. + + Parameters: + values (iterable, optional): a mapping (dictionary) of (string : Any) or an iterable of key-value pairs of type (string, Any) + + Examples: + .. code-block:: python + + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self, num_stacked_param): + ... super().__init__() + ... # create ParameterDict with iterable Parameters + ... self.params = paddle.nn.ParameterDict( + ... {f"t{i}": paddle.create_parameter(shape=[2, 2], dtype='float32') for i in range(num_stacked_param)}) + ... + ... def forward(self, x): + ... for i, key in enumerate(self.params): + ... x = paddle.matmul(x, self.params[key]) + ... return x + ... 
+ >>> x = paddle.uniform(shape=[5, 2], dtype='float32') + >>> num_stacked_param = 4 + >>> model = MyLayer(num_stacked_param) + >>> print(len(model.params)) + 4 + >>> res = model(x) + >>> print(res.shape) + [5, 2] + + >>> replaced_param = paddle.create_parameter(shape=[2, 3], dtype='float32') + >>> model.params['t3'] = replaced_param # replace t3 param + >>> res = model(x) + >>> print(res.shape) + [5, 3] + >>> model.params['t4'] = paddle.create_parameter(shape=[3, 4], dtype='float32') # append param + >>> print(len(model.params)) + 5 + >>> res = model(x) + >>> print(res.shape) + [5, 4] + """ + + def __init__( + self, + parameters: ( + ParameterDict + | Mapping[str, Tensor] + | Sequence[tuple[str, Tensor]] + | None + ) = None, + ) -> None: + super().__init__() + if parameters is not None: + self.update(parameters) + + def __getitem__(self, key: str) -> Tensor: + with param_guard(self._parameters): + return self._parameters[key] + + def __setitem__(self, key: str, param: Tensor) -> None: + assert isinstance(param, Parameter) + setattr(self, key, param) + + def __len__(self) -> int: + return len(self._parameters) + + def __iter__(self) -> Iterator[str]: + return iter(self._parameters) + + def update( + self, + parameters: ( + ParameterDict | Mapping[str, Tensor] | Sequence[tuple[str, Tensor]] + ), + ) -> None: + """Update a given parameter at the end of the dict. + + Parameters: + parameters (Parameter): parameter to update + """ + assert isinstance(parameters, Iterable), ( + "The type of parameters is not iterable of key/value pairs, the type of sublayers is " + + type(parameters).__name__ + ) + + if isinstance(parameters, (OrderedDict, ParameterDict, Mapping)): + for key, parameter in parameters.items(): + self.add_parameter(key, parameter) + else: + for i, kv in enumerate(parameters): + if len(kv) != 2: + raise ValueError( + f"The length of the {i}'s element in parameters is {len(kv)}, which must be 2." + ) + self.add_parameter(kv[0], kv[1]) + + class ParameterList(Layer): """ParameterList Container. diff --git a/test/legacy_test/test_imperative_container_parameterdict.py b/test/legacy_test/test_imperative_container_parameterdict.py new file mode 100644 index 0000000000000..b17ba7cda43c2 --- /dev/null +++ b/test/legacy_test/test_imperative_container_parameterdict.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
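+#
+# A minimal usage sketch of the ParameterDict container exercised below
+# (shapes and key names here are illustrative): parameters stored through
+# the dict are registered on the Layer, so they appear in parameters()
+# and receive gradients, e.g.
+#
+#   params = paddle.nn.ParameterDict(
+#       {'w': paddle.create_parameter(shape=[2, 2], dtype='float32')})
+#   y = paddle.matmul(x, params['w'])  # 'w' takes part in autograd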
+ +import unittest +from collections import OrderedDict + +import numpy as np + +import paddle +from paddle import base + + +class MyLayer(paddle.nn.Layer): + def __init__(self, num_stacked_param): + super().__init__() + # create ParameterDict with iterable Parameters + self.params = self.paddle_imperative_ParameterDict(num_stacked_param) + + def paddle_imperative_ParameterDict(self, num_stacked_param): + return paddle.nn.ParameterDict( + [ + ( + 't' + str(i), + paddle.create_parameter(shape=[2, 2], dtype='float32'), + ) + for i in range(num_stacked_param) + ] + ) + + def forward(self, x): + for i, key in enumerate(self.params): + x = paddle.matmul(x, self.params[key]) + return x + + +class TestImperativeContainerParameterDict(unittest.TestCase): + def paramter_dict(self): + self.place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32') + with base.dygraph.guard(): + x = paddle.to_tensor(data_np) + num_stacked_param = 4 + model = MyLayer(num_stacked_param) + self.assertEqual(len(model.params), num_stacked_param) + res = model(x) + self.assertListEqual(res.shape, [5, 2]) + loss = paddle.mean(res) + loss.backward() + + model.params['t' + str(num_stacked_param - 1)] = ( + paddle.create_parameter(shape=[2, 3], dtype='float32') + ) + res = model(x) + self.assertListEqual(res.shape, [5, 3]) + parmeter = OrderedDict( + [ + ( + 't' + str(num_stacked_param), + paddle.create_parameter(shape=[3, 4], dtype='float32'), + ) + ] + ) + model.params.update(parmeter) + self.assertEqual(len(model.params), num_stacked_param + 1) + res = model(x) + self.assertListEqual(res.shape, [5, 4]) + loss = paddle.mean(res) + loss.backward() + + def test_paramter_dict(self): + self.paramter_dict() + + +if __name__ == '__main__': + unittest.main() From 4bb7391eda3afe41b268746eb5cac65401f9e181 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 11 Oct 2024 10:52:56 +0800 Subject: [PATCH 072/135] [PIR] fix some code for support custom device (#68061) * fix some code for support custom device --- paddle/fluid/framework/executor_cache.cc | 14 +++ .../ir_adaptor/translator/op_translator.cc | 13 +++ .../pir/transforms/pd_op_to_kernel_pass.cc | 52 +++++----- paddle/phi/api/lib/data_transform.h | 19 ++++ paddle/phi/common/backend.h | 14 +++ paddle/phi/common/place.cc | 11 +++ paddle/phi/common/place.h | 3 +- paddle/phi/kernels/{cpu => }/data_kernel.cc | 57 ++--------- paddle/phi/kernels/data_kernel.h | 6 +- paddle/phi/kernels/gpu/data_kernel.cu | 67 ------------- paddle/phi/kernels/impl/data_impl.h | 16 +-- paddle/phi/kernels/memcpy_kernel.cc | 98 +++++-------------- python/paddle/jit/pir_translated_layer.py | 1 + python/paddle/tensor/creation.py | 2 + 14 files changed, 150 insertions(+), 223 deletions(-) rename paddle/phi/kernels/{cpu => }/data_kernel.cc (57%) delete mode 100644 paddle/phi/kernels/gpu/data_kernel.cu diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 43a2f7f9875a8..9a838e9518834 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -247,6 +247,11 @@ std::unique_ptr<::pir::Program> ConstructForwardIrProgram( // TODO(phlrain) : using tensor dtype op_desc->SetAttr("dtype", 0); op_desc->SetAttr("place", static_cast(p)); + if (p == phi::AllocationType::CUSTOM) { + op_desc->SetAttr("place_device_id", in_t.place().GetDeviceId()); + op_desc->SetAttr("place_device_type", in_t.place().GetDeviceType()); + } + 
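+    // Note: for custom devices the allocation type alone cannot identify
+    // the placement, so the device id and device type are recorded as
+    // extra attributes; DataOpTranscriber below re-assembles them into
+    // the PlaceAttribute of the data op.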
op_desc->SetAttr("name", name); op_desc->SetOutput("out", {name}); } @@ -264,6 +269,10 @@ std::unique_ptr<::pir::Program> ConstructForwardIrProgram( // TODO(phlrain) : using tensor dtype op_desc->SetAttr("dtype", 0); op_desc->SetAttr("place", static_cast(p)); + if (p == phi::AllocationType::CUSTOM) { + op_desc->SetAttr("place_device_id", param.place().GetDeviceId()); + op_desc->SetAttr("place_device_type", param.place().GetDeviceType()); + } op_desc->SetAttr("name", name); op_desc->SetOutput("out", {name}); @@ -344,6 +353,11 @@ std::unique_ptr<::pir::Program> ConstructBackwardIrProgram( // TODO(phlrain) : using tensor dtype op_desc->SetAttr("dtype", 0); op_desc->SetAttr("place", static_cast(p)); + if (p == phi::AllocationType::CUSTOM) { + op_desc->SetAttr("place_device_id", tensor.place().GetDeviceId()); + op_desc->SetAttr("place_device_type", tensor.place().GetDeviceType()); + } + op_desc->SetAttr("name", var_name); op_desc->SetOutput("out", {var_name}); } diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index dc60d9b9a0f38..9e5b092d21188 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1527,6 +1527,19 @@ struct DataOpTranscriber : public FeedOpTranscriber { ctx, phi::Place(static_cast(allocate_type)))}, }; + if (static_cast(allocate_type) == + phi::AllocationType::CUSTOM) { + int place_device_id = + PADDLE_GET_CONST(int, op_desc.GetAttr("place_device_id")); + std::string place_device_type = + PADDLE_GET_CONST(std::string, op_desc.GetAttr("place_device_type")); + attribute_map["place"] = paddle::dialect::PlaceAttribute::get( + ctx, + phi::Place(static_cast(allocate_type), + place_device_id, + place_device_type)); + } + return attribute_map; } }; diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 244dcedd9bb5a..95635f2543283 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -211,6 +211,7 @@ static bool NeedFallBackCpu(const pir::Operation* op, return false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static bool NeedFallBackFromGPUDNN2GPU(pir::Operation* op, const std::string& kernel_name, const phi::KernelKey kernel_key) { @@ -275,6 +276,7 @@ static bool NeedFallBackFromGPUDNN2GPU(pir::Operation* op, return false; } +#endif bool CanRunOnCpuKernel(const std::vector<::pir::Value>& vec_inputs, ::pir::Operation* op) { @@ -326,7 +328,7 @@ static phi::Backend DeriveBackend(const std::string& op, // NOTE: Parameters are initialized on executor place defined if ((op == pir::SetParameterOp::name() || op == pir::ShadowOutputOp::name()) && - place.GetType() == phi::AllocationType::GPU) { + phi::is_accelerat_allocation_type(place.GetType())) { return phi::TransToPhiBackend(place); } // Tensor Attribute should on cpu backend for better performance @@ -486,6 +488,9 @@ static pir::Value AddPlaceTransferOp(pir::Value in, case phi::AllocationType::CUSTOM: new_backend = phi::Backend::CUSTOM; break; + case phi::AllocationType::IPU: + new_backend = phi::Backend::IPU; + break; default: new_backend = phi::Backend::CPU; break; @@ -494,18 +499,14 @@ static pir::Value AddPlaceTransferOp(pir::Value in, }; std::unordered_map op_attribute; if ((src_place.GetType() == phi::AllocationType::CPU) && - (dst_place.GetType() == phi::AllocationType::GPU || - dst_place.GetType() == phi::AllocationType::XPU || - 
dst_place.GetType() == phi::AllocationType::CUSTOM)) { + phi::is_accelerat_allocation_type(dst_place.GetType())) { copy_kernel_key.set_backend(place2backend(dst_place.GetType())); op_attribute = { {"op_name", pir::StrAttribute::get(ctx, "pd_op.memcpy_h2d")}, {"kernel_name", pir::StrAttribute::get(ctx, "memcpy_h2d")}, {"kernel_key", KernelAttribute::get(ctx, copy_kernel_key)}, {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; - } else if ((src_place.GetType() == phi::AllocationType::GPU || - src_place.GetType() == phi::AllocationType::XPU || - src_place.GetType() == phi::AllocationType::CUSTOM) && + } else if (phi::is_accelerat_allocation_type(src_place.GetType()) && (dst_place.GetType() == phi::AllocationType::CPU)) { copy_kernel_key.set_backend(place2backend(src_place.GetType())); @@ -1351,10 +1352,11 @@ phi::KernelKey GetKernelKey( phi::KernelKey res(kernel_backend, kernel_layout, kernel_dtype); // kernel backend infered incorrectly from memcpy op operands, +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // case that place from (not GPU) to GPU. // We handle this special case by following code to fix up the problem. // This could be further improved if we had another method. - if (!phi::is_gpu_place(place)) { + if (!phi::is_accelerat_place(place)) { if (op->isa()) { VLOG(6) << "MemcpyOp need a special handle"; int dst_place_type = op->attribute("dst_place_type") @@ -1365,6 +1367,7 @@ phi::KernelKey GetKernelKey( } } } +#endif if (op->isa()) { res.set_dtype(phi::DataType::FLOAT32); @@ -1387,11 +1390,13 @@ phi::KernelKey GetKernelKey( VLOG(8) << "kernel backend must be on CPU when need fallback"; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (NeedFallBackFromGPUDNN2GPU(op, kernel_fn_str, res)) { res.set_backend(phi::Backend::GPU); VLOG(8) << "kernel backend must be on GPU when need fallback from GPUDNN " "to GPU"; } +#endif #ifdef PADDLE_WITH_DNNL std::regex reg(","); @@ -1651,13 +1656,7 @@ void AddShadowFeedForValue( pir::IrContext* ctx, std::unordered_map* map_op_pair, std::unordered_map* map_value_pair) { - phi::Backend backend = phi::Backend::GPU; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - backend = phi::Backend::GPU; -#endif -#if defined(PADDLE_WITH_XPU) - backend = phi::Backend::XPU; -#endif + phi::Backend backend = paddle::experimental::get_accelerat_backend(); if (op_item->result(index).type().isa()) { phi::KernelKey shadow_key{ @@ -1758,8 +1757,7 @@ void AddShadowFeedForTuplePopOp( } // if value place not gpu, add shadow feed op - if ((phi::is_gpu_place(place) || phi::is_xpu_place(place)) && - add_shadow_feed) { + if (phi::is_accelerat_place(place) && add_shadow_feed) { for (size_t i = 0; i < op_item->num_results(); ++i) { AddShadowFeedForValue(i, op_item, @@ -2154,9 +2152,10 @@ void HandleForSpecialOp( auto out_place = phi::TransToPhiPlace(dst_backend); auto out_type = AllocatedDenseTensorType::get(ctx, out_place, value_type); - phi::KernelKey kernel_key(phi::Backend::GPU, - phi::DataLayout::ANY, - TransToPhiDataType(value_type.dtype())); + phi::KernelKey kernel_key( + paddle::experimental::get_accelerat_backend(), + phi::DataLayout::ANY, + TransToPhiDataType(value_type.dtype())); new_in = AddPlaceTransferOp( new_in, out_type, in_place, out_place, kernel_key, block); } @@ -2998,11 +2997,9 @@ void AddShadowFeedOpForDataOrFeed( std::unordered_map* map_op_pair, std::unordered_map* map_value_pair) { bool feed_op_add_shadow_feed = - (op_item->isa()) && - (phi::is_gpu_place(place) || phi::is_xpu_place(place)); + (op_item->isa()) && 
phi::is_accelerat_place(place); bool data_op_add_shadow_feed = - (op_item->isa()) && - (phi::is_gpu_place(place) || phi::is_xpu_place(place)) && + (op_item->isa()) && phi::is_accelerat_place(place) && (kernel_op->attributes() .at("place") .dyn_cast() @@ -3275,12 +3272,13 @@ void ProcessBlock( AllocatedDenseTensorType::get(ctx, phi::Place(), dense_tensor_type)); } } - if (phi::is_gpu_place(place) || phi::is_xpu_place(place)) { + if (phi::is_accelerat_place(place)) { for (auto& [keyword, arg] : block->kwargs()) { if (auto dense_tensor_type = arg.type().dyn_cast()) { auto dtype = dense_tensor_type.dtype(); - phi::KernelKey shadow_key{ - phi::Backend::GPU, phi::DataLayout::ANY, TransToPhiDataType(dtype)}; + phi::KernelKey shadow_key{paddle::experimental::get_accelerat_backend(), + phi::DataLayout::ANY, + TransToPhiDataType(dtype)}; std::unordered_map attr_map{ {"op_name", pir::StrAttribute::get(ctx, "pd_op.shadow_feed")}, {"kernel_name", pir::StrAttribute::get(ctx, "shadow_feed")}, diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 9e023428a7672..096694abd8b7e 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -174,10 +174,29 @@ inline bool NeedTransformPlace(const phi::Place& src_place, if (!transform_flag.need_trans_backend()) { return false; } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool ret = src_place.GetType() == AllocationType::GPUPINNED || (target != Backend::ALL_BACKEND && phi::TransToPhiBackend(src_place) != (target != Backend::GPUDNN ? target : Backend::GPU)); +#elif defined(PADDLE_WITH_XPU) + bool ret = target != Backend::ALL_BACKEND && + phi::TransToPhiBackend(src_place) != target; +#elif defined(PADDLE_WITH_IPU) + bool ret = target != Backend::ALL_BACKEND && + phi::TransToPhiBackend(src_place) != target; +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + bool ret = target != Backend::ALL_BACKEND; + if (target == Backend::CUSTOM) { + ret = ret && !is_custom_place(src_place); + } else { + ret = ret && phi::TransToPhiBackend(src_place) != target; + } +#else + bool ret = false; +#endif + #ifdef PADDLE_WITH_DNNL if (target == Backend::ONEDNN) { ret = src_place.GetType() != AllocationType::CPU; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 64dab3ccdeb3b..eb44a3ad58889 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -193,6 +193,20 @@ inline std::string BackendToString(const Backend& backend) { } } +inline Backend get_accelerat_backend() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return Backend::GPU; +#elif defined(PADDLE_WITH_XPU) + return Backend::XPU; +#elif defined(PADDLE_WITH_IPU) + return Backend::IPU; +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + return Backend::CUSTOM; +#else + return Backend::UNDEFINED; +#endif +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 78539318b594f..8f806b7350bc7 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -179,6 +179,17 @@ bool is_custom_place(const Place &p) { return p.GetType() == phi::AllocationType::CUSTOM; } +bool is_accelerat_place(const Place &p) { + return is_gpu_place(p) || is_xpu_place(p) || is_ipu_place(p) || + is_custom_place(p); +} + +bool is_accelerat_allocation_type(AllocationType type) { + return type == phi::AllocationType::GPU || type == phi::AllocationType::XPU || + type == phi::AllocationType::IPU || + type == phi::AllocationType::CUSTOM; +} + bool 
places_are_same_class(const Place &p1, const Place &p2) { #ifdef PADDLE_WITH_CUSTOM_DEVICE if (is_custom_place(p1) && is_custom_place(p2)) { diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index c02c35217b26f..9b46d6f7a3cc5 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -208,9 +208,10 @@ bool is_ipu_place(const Place&); TEST_API bool is_cpu_place(const Place&); bool is_cuda_pinned_place(const Place&); bool is_custom_place(const Place& p); +bool is_accelerat_place(const Place& p); bool places_are_same_class(const Place&, const Place&); bool is_same_place(const Place&, const Place&); - +bool is_accelerat_allocation_type(AllocationType type); } // namespace phi namespace paddle { diff --git a/paddle/phi/kernels/cpu/data_kernel.cc b/paddle/phi/kernels/data_kernel.cc similarity index 57% rename from paddle/phi/kernels/cpu/data_kernel.cc rename to paddle/phi/kernels/data_kernel.cc index 2081b0bd8e748..67dcd475cb6bc 100644 --- a/paddle/phi/kernels/cpu/data_kernel.cc +++ b/paddle/phi/kernels/data_kernel.cc @@ -53,53 +53,14 @@ PD_REGISTER_KERNEL(data, phi::complex64, phi::complex128) {} -PD_REGISTER_KERNEL(shadow_feed, - CPU, - ALL_LAYOUT, - phi::ShadowFeedKernel, - bool, - uint8_t, - float, - int8_t, - int16_t, - int32_t, - int64_t, - double, - phi::float16, - phi::bfloat16, - phi::complex64, - phi::complex128) {} - -PD_REGISTER_KERNEL(shadow_feed_tensors, - CPU, - ALL_LAYOUT, - phi::ShadowFeedTensorsKernel, - bool, - uint8_t, - float, - int8_t, - int16_t, - int32_t, - int64_t, - double, - phi::float16, - phi::bfloat16, - phi::complex64, - phi::complex128) {} - -PD_REGISTER_KERNEL(print_kernel, - CPU, - ALL_LAYOUT, - phi::PrintKernel, - bool, - float, - int32_t, - int64_t, - double, - phi::float16, - phi::bfloat16, - phi::complex64, - phi::complex128) {} - +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(shadow_feed, + ALL_LAYOUT, + phi::ShadowFeedKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(shadow_feed_tensors, + ALL_LAYOUT, + phi::ShadowFeedTensorsKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(print_kernel, + ALL_LAYOUT, + phi::PrintKernel) {} PD_REGISTER_KERNEL( shadow_output, CPU, ALL_LAYOUT, phi::ShadowOutputKernel, float) {} diff --git a/paddle/phi/kernels/data_kernel.h b/paddle/phi/kernels/data_kernel.h index 96242f3ef94d4..aea3918f3cb92 100644 --- a/paddle/phi/kernels/data_kernel.h +++ b/paddle/phi/kernels/data_kernel.h @@ -31,19 +31,19 @@ void ShadowOutputKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); -template +template void ShadowFeedKernel(const Context& ctx, const DenseTensor& x, int dst_place_type, DenseTensor* out); -template +template void ShadowFeedTensorsKernel(const Context& ctx, const std::vector& xs, int dst_place_type, std::vector outs); -template +template void PrintKernel(const Context& ctx, const DenseTensor& x, int first_n, diff --git a/paddle/phi/kernels/gpu/data_kernel.cu b/paddle/phi/kernels/gpu/data_kernel.cu deleted file mode 100644 index e1634fce75274..0000000000000 --- a/paddle/phi/kernels/gpu/data_kernel.cu +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/data_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/data_impl.h" - -PD_REGISTER_KERNEL(shadow_feed, - GPU, - ALL_LAYOUT, - phi::ShadowFeedKernel, - bool, - uint8_t, - float, - int8_t, - int16_t, - int32_t, - int64_t, - double, - phi::float16, - phi::bfloat16, - phi::complex64, - phi::complex128) {} - -PD_REGISTER_KERNEL(shadow_feed_tensors, - GPU, - ALL_LAYOUT, - phi::ShadowFeedTensorsKernel, - bool, - uint8_t, - float, - int8_t, - int16_t, - int32_t, - int64_t, - double, - phi::float16, - phi::bfloat16, - phi::complex64, - phi::complex128) {} - -PD_REGISTER_KERNEL(print_kernel, - GPU, - ALL_LAYOUT, - phi::PrintKernel, - bool, - float, - int32_t, - int64_t, - double, - phi::float16, - phi::bfloat16, - phi::complex64, - phi::complex128) {} diff --git a/paddle/phi/kernels/impl/data_impl.h b/paddle/phi/kernels/impl/data_impl.h index 487840353cb12..960c31b6cf270 100644 --- a/paddle/phi/kernels/impl/data_impl.h +++ b/paddle/phi/kernels/impl/data_impl.h @@ -23,7 +23,7 @@ namespace phi { const char kForward[] = "FORWARD"; const char kBackward[] = "BACKWARD"; -template +template void ShadowFeedKernel(const Context& ctx, const DenseTensor& x, int dst_place_type, @@ -41,6 +41,10 @@ void ShadowFeedKernel(const Context& ctx, case 1: // XPUPlace target_place = XPUPlace(backends::xpu::GetXPUCurrentDeviceId()); break; +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + case 1: // CustomPlace + target_place = ctx.GetPlace(); + break; #endif default: PADDLE_THROW(errors::Unimplemented("dst_place_type: %d is not supported.", @@ -50,9 +54,9 @@ void ShadowFeedKernel(const Context& ctx, if (!x.initialized()) { if (target_place == CPUPlace()) { - ctx.template HostAlloc(out); + ctx.HostAlloc(out, out->dtype()); } else { - ctx.template Alloc(out); + ctx.Alloc(out, out->dtype()); } return; } @@ -65,17 +69,17 @@ void ShadowFeedKernel(const Context& ctx, } } -template +template void ShadowFeedTensorsKernel(const Context& ctx, const std::vector& xs, int dst_place_type, std::vector outs) { for (size_t i = 0; i < xs.size(); ++i) { - ShadowFeedKernel(ctx, *(xs[i]), dst_place_type, outs[i]); + ShadowFeedKernel(ctx, *(xs[i]), dst_place_type, outs[i]); } } -template +template void PrintKernel(const Context& ctx, const DenseTensor& x, int first_n, diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index facee1f317a63..ddfc2cb897272 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -30,6 +30,11 @@ void MemcpyH2DKernel(const Context& dev_ctx, const DenseTensor& x, int dst_place_type, DenseTensor* out) { + if (!x.initialized()) { + out->set_meta(x.meta()); + return; + } + PADDLE_ENFORCE_GE( dst_place_type, 0, @@ -40,7 +45,6 @@ void MemcpyH2DKernel(const Context& dev_ctx, 3, errors::OutOfRange("dst_place_type only support 0-3, but got: %d", dst_place_type)); - Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); } @@ -123,6 +127,16 @@ void MemcpyKernel(const Context& dev_ctx, dev_ctx.Alloc(out, x.dtype(), 0, true); 
Copy(dev_ctx, x, GPUPinnedPlace(), false, out); break; +#elif defined(PADDLE_WITH_XPU) + case 3: // XPUPlace + dev_ctx.Alloc(out, x.dtype()); + Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + break; +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + case 4: // CustomPlace + dev_ctx.Alloc(out, x.dtype()); + Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + break; #endif default: PADDLE_THROW(errors::Unimplemented( @@ -133,79 +147,21 @@ void MemcpyKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, - CPU, - ALL_LAYOUT, - phi::MemcpyH2DKernel) {} - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h, - CPU, - ALL_LAYOUT, - phi::MemcpyD2HKernel) { +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(memcpy_h2d, + ALL_LAYOUT, + phi::MemcpyH2DKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(memcpy_d2h, + ALL_LAYOUT, + phi::MemcpyD2HKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h_multi_io, - CPU, - ALL_LAYOUT, - phi::MemcpyD2HMultiIOKernel) { +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(memcpy_d2h_multi_io, + ALL_LAYOUT, + phi::MemcpyD2HMultiIOKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy, - CPU, - ALL_LAYOUT, - phi::MemcpyKernel) { +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(memcpy, + ALL_LAYOUT, + phi::MemcpyKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, - GPU, - ALL_LAYOUT, - phi::MemcpyH2DKernel) {} - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h, - GPU, - ALL_LAYOUT, - phi::MemcpyD2HKernel) { - kernel->OutputAt(0).SetBackend(phi::Backend::CPU); -} - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h_multi_io, - GPU, - ALL_LAYOUT, - phi::MemcpyD2HMultiIOKernel) { - kernel->OutputAt(0).SetBackend(phi::Backend::CPU); -} - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy, - GPU, - ALL_LAYOUT, - phi::MemcpyKernel) { - kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); -} - -#endif - -#ifdef PADDLE_WITH_XPU -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, - XPU, - ALL_LAYOUT, - phi::MemcpyH2DKernel) {} - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h, - XPU, - ALL_LAYOUT, - phi::MemcpyD2HKernel) { - kernel->OutputAt(0).SetBackend(phi::Backend::CPU); -} - -PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h_multi_io, - XPU, - ALL_LAYOUT, - phi::MemcpyD2HMultiIOKernel) { - kernel->OutputAt(0).SetBackend(phi::Backend::CPU); -} - -#endif diff --git a/python/paddle/jit/pir_translated_layer.py b/python/paddle/jit/pir_translated_layer.py index 4602ddf52d23a..9ae0758676098 100644 --- a/python/paddle/jit/pir_translated_layer.py +++ b/python/paddle/jit/pir_translated_layer.py @@ -113,6 +113,7 @@ def _preprocess(self): name=var_name, shape=org_value.shape, dtype=org_value.dtype, + place=paddle.base.core.Place(), ) org_value.replace_all_uses_with(value) value.get_defining_op().move_before(op) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c91831bd42d06..c08babbfe6945 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -2881,6 +2881,8 @@ def _memcpy(input, place=None, output=None) -> paddle.Tensor: dst_place_type = 2 elif p.is_xpu_place(): dst_place_type = 3 + elif p.is_custom_place(): + dst_place_type = 4 if in_pir_mode(): return _C_ops.memcpy(input, dst_place_type) From f5c427993b7e82d3c7016a69bcb5353a8b3bd51e Mon Sep 17 00:00:00 2001 From: zhanghonggeng 
<43205915+zhanghonggeng@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:19:27 +0800 Subject: [PATCH 073/135] Add Conv2dAddActFuse & TransferLayout pass before CINN pass (#68615) --- .../fluid/inference/api/analysis_predictor.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 9740c9b4bc32f..f148fac6367e2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -834,6 +834,24 @@ void AnalysisPredictor::OptimizeInferencePirProgram() { return pass_manager; }; + if (config_.use_gpu() && config_.cinn_enabled()) { + if (!config_.custom_pass_only_) { + ::pir::PassManager fused_op_pm(::pir::IrContext::Instance(), + config_.pm_opt_level_); + const std::vector FusedOpPasses{// Operator fusion pass + "conv2d_bn_fuse_pass", + "conv2d_add_act_fuse_pass", + "conv2d_add_fuse_pass", + "transfer_layout_pass"}; + + for (const auto &fused_op : FusedOpPasses) { + fused_op_pm.AddPass(pir::PassRegistry::Instance().Get(fused_op)); + } + + fused_op_pm.Run(pir_program_.get()); + } + } + if (paddle::prim::PrimCommonUtils::IsFwdPrimEnabled()) { VLOG(4) << "[Prim] Decomp program in predictor begin."; DecompProgram decomp_object(pir_program_.get()); From 2d75c3af4867f556e4c931218600ea2ab2a0b235 Mon Sep 17 00:00:00 2001 From: Nana <49900969+NKNaN@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:42:36 +0800 Subject: [PATCH 074/135] =?UTF-8?q?Determinant=20kernel=20support=20comple?= =?UTF-8?q?x=20=E6=98=93=E7=94=A8=E6=80=A7=E6=8F=90=E5=8D=87=20(#68390)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update determinant kernel * fix ci coverage and DCU build error --- .../kernels/cpu/determinant_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/determinant_kernel.cc | 10 +- .../kernels/gpu/determinant_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/determinant_kernel.cu | 235 +++++++++++++++++- .../impl/determinant_grad_kernel_impl.h | 7 + .../kernels/impl/determinant_kernel_impl.h | 35 ++- python/paddle/tensor/linalg.py | 7 +- test/legacy_test/test_determinant_op.py | 85 ++++++- 8 files changed, 376 insertions(+), 11 deletions(-) diff --git a/paddle/phi/kernels/cpu/determinant_grad_kernel.cc b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc index e57d7263f88bf..0eb588c0dc4b4 100644 --- a/paddle/phi/kernels/cpu/determinant_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc @@ -22,4 +22,6 @@ PD_REGISTER_KERNEL(determinant_grad, ALL_LAYOUT, phi::DeterminantGradKernel, float, - double) {} + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/determinant_kernel.cc b/paddle/phi/kernels/cpu/determinant_kernel.cc index 5810e88e92527..fe212b848b66d 100644 --- a/paddle/phi/kernels/cpu/determinant_kernel.cc +++ b/paddle/phi/kernels/cpu/determinant_kernel.cc @@ -17,5 +17,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" -PD_REGISTER_KERNEL( - determinant, CPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} +PD_REGISTER_KERNEL(determinant, + CPU, + ALL_LAYOUT, + phi::DeterminantKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu index f3187d5fefb51..26cb97f74866b 100644 --- a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(determinant_grad, phi::DeterminantGradKernel, phi::dtype::float16, float, - double) {} + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu index 58e27e3ce4abd..19d9c87bc8eb3 100644 --- a/paddle/phi/kernels/gpu/determinant_kernel.cu +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -14,8 +14,237 @@ #include "paddle/phi/kernels/determinant_kernel.h" +#include +#include +#include +#include +#include +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +#include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" + +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace detail { +template +class EigenMatrix {}; + +template <> +class EigenMatrix { + public: + using MatrixType = + Eigen::Matrix; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXf; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXd; +}; + +inline int64_t GetBatchCount(const DDim dims) { + int64_t batch_count = 1; + auto dim_size = dims.size(); + PADDLE_ENFORCE_GE( + dim_size, + 2, + common::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + + // Cumulative multiplying each dimension until the last 2 to get the batch + // count, + // for example a tensor with shape [3,3,3,3], the batch count of matrices is + // 9. + for (int64_t i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + + return batch_count; +} +} // namespace detail + +template +struct DeterminantCudaFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + int64_t rank, + int64_t batch_count, + DenseTensor* output) { + std::vector input_vec; + std::vector output_vec; + phi::TensorToVector(input, dev_ctx, &input_vec); + using MPType = typename phi::dtype::MPTypeTrait::Type; + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector sub_vec(begin_iter, + end_iter); // get every square matrix data + typename detail::EigenMatrix::MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = sub_vec[rank * i + j]; + } + } + output_vec.push_back( + static_cast(matrix.template cast().determinant())); + } + phi::TensorFromVector(output_vec, dev_ctx, output); + } +}; + +template +__global__ void GetDetFromLUComplex(const T* lu_data, + const int* ipiv, + int64_t n, + int64_t batch_size, + T* out_data) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < batch_size) { + int offset_lu = idx * n * n; + int offset_ipiv = idx * n; + T out_idx = T(1.0, 0.0); + T negative = T(-1.0, 0.0); + for (int i = 0; i < n; ++i) { + out_idx *= lu_data[offset_lu + i * n + i]; + if (ipiv[offset_ipiv + i] != i + 1) { + out_idx *= negative; + } + } + out_data[idx] = out_idx; + } +} + +template +struct DeterminantCudaFunctor, Context> { + void operator()(const Context& dev_ctx, + const DenseTensor& a, + int64_t n, + int64_t batch_size, + DenseTensor* output) { +#ifndef 
PADDLE_WITH_HIP + phi::Allocator::AllocationPtr tmp_gpu_mat_data; + const phi::dtype::complex* gpu_mat = a.data>(); + // Copy all elements of input matrix A to a temporary memory space to + // avoid being overriden by getrf. + tmp_gpu_mat_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + a.numel() * sizeof(phi::dtype::complex), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + memory_utils::Copy(dev_ctx.GetPlace(), + tmp_gpu_mat_data->ptr(), + dev_ctx.GetPlace(), + a.data(), + a.numel() * sizeof(phi::dtype::complex), + dev_ctx.stream()); + gpu_mat = reinterpret_cast*>( + tmp_gpu_mat_data->ptr()); + + std::vector*> cpu_ptrs(batch_size); + for (int i = 0; i < batch_size; ++i) { + cpu_ptrs[i] = gpu_mat + i * n * n; + } + + int num_ints = batch_size * (n + 1); + // num_ints is for pivot (n * batch_size) and info (batch_size) + size_t total_bytes = + batch_size * sizeof(phi::dtype::complex*) + num_ints * sizeof(int); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + total_bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + memory_utils::Copy(dev_ctx.GetPlace(), + tmp_gpu_ptrs_data->ptr(), + phi::CPUPlace(), + static_cast(cpu_ptrs.data()), + cpu_ptrs.size() * sizeof(phi::dtype::complex*), + dev_ctx.stream()); + + phi::dtype::complex** gpu_mat_ptr = + reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); + int* gpu_info_ptr = reinterpret_cast(gpu_mat_ptr + cpu_ptrs.size()); + int* pivot_data = gpu_info_ptr + batch_size; + + auto blas = phi::funcs::GetBlas>(dev_ctx); + // This function performs the LU factorization of each matrix A by the + // equation P * A = L * U. L and U are written back to original matrix A, + // and diagonal elements of L are discarded. + blas.BatchedGETRF(n, gpu_mat_ptr, pivot_data, gpu_info_ptr, batch_size); + phi::dtype::complex* out_data = + dev_ctx.template Alloc>(output); + int block_size = std::min(256, dev_ctx.GetMaxThreadsPerBlock()); + dim3 dim_block(block_size); + dim3 num_blocks((batch_size + block_size - 1) / block_size); + GetDetFromLUComplex><<>>( + gpu_mat, pivot_data, n, batch_size, out_data); +#else + using MatrixType = + Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>; + std::vector> input_vec; + std::vector> output_vec; + phi::TensorToVector(a, dev_ctx, &input_vec); + for (int64_t i = 0; i < batch_size; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * n * n; + auto end_iter = input_vec.begin() + (i + 1) * n * n; + std::vector> sub_vec( + begin_iter, + end_iter); // get every square matrix data + MatrixType matrix(n, n); + for (int64_t i = 0; i < n; ++i) { + for (int64_t j = 0; j < n; ++j) { + matrix(i, j) = static_cast>(sub_vec[n * i + j]); + } + } + output_vec.push_back( + static_cast>(matrix.determinant())); + } + phi::TensorFromVector(output_vec, dev_ctx, output); +#endif + } +}; + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto input_dim = common::vectorize(x.dims()); + auto input_dim_size = input_dim.size(); + + auto batch_count = detail::GetBatchCount(x.dims()); + VLOG(10) << "input dim:" << x.dims(); + PADDLE_ENFORCE_GE( + input_dim_size, + 2, + common::errors::InvalidArgument("the input matrix dimension size should " + "greater than or equal to 2.")); + PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], + input_dim[input_dim_size - 2], + common::errors::InvalidArgument( + "the input matrix should be square matrix.")); + auto rank = input_dim[input_dim_size - 1]; // square matrix length + 
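+  // For complex dtypes the functor above runs a batched GETRF (the LU
+  // factorization P * A = L * U with unit-diagonal L), so the determinant
+  // reduces to det(A) = (-1)^s * prod_i U(i, i), where s is the number of
+  // row interchanges recorded in ipiv (entries with ipiv[i] != i + 1).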
DeterminantCudaFunctor()(dev_ctx, x, rank, batch_count, out); + auto output_dims = common::slice_ddim(x.dims(), 0, input_dim_size - 2); + if (input_dim_size > 2) { + out->Resize(output_dims); + } else { + // when input is a two-dimension matrix, The det value is a number. + out->Resize(common::make_ddim({})); + } + VLOG(10) << "output dim:" << out->dims(); +} + +} // namespace phi PD_REGISTER_KERNEL(determinant, GPU, @@ -23,4 +252,6 @@ PD_REGISTER_KERNEL(determinant, phi::DeterminantKernel, phi::dtype::float16, float, - double) {} + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index bc3ee4f7c4d31..0dd89e0048507 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -19,6 +19,7 @@ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -169,6 +170,12 @@ void DeterminantGradKernel(const Context& dev_ctx, res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); } + // result for complex input should conjugate at the last step + if (std::is_same>::value || + std::is_same>::value) { + res = phi::Conj(dev_ctx, res); + } + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); x_grad->Resize(x.dims()); diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h index 51d01a0cd5084..87b7f8b355ee2 100644 --- a/paddle/phi/kernels/impl/determinant_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -101,6 +101,37 @@ struct DeterminantFunctor { } }; +template +struct DeterminantFunctor, Context> { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + int64_t rank, + int64_t batch_count, + DenseTensor* output) { + using MatrixType = + Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>; + std::vector> input_vec; + std::vector> output_vec; + phi::TensorToVector(input, dev_ctx, &input_vec); + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector> sub_vec( + begin_iter, + end_iter); // get every square matrix data + MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = static_cast>(sub_vec[rank * i + j]); + } + } + output_vec.push_back( + static_cast>(matrix.determinant())); + } + phi::TensorFromVector(output_vec, dev_ctx, output); + } +}; + template void DeterminantKernel(const Context& dev_ctx, const DenseTensor& x, @@ -113,8 +144,8 @@ void DeterminantKernel(const Context& dev_ctx, PADDLE_ENFORCE_GE( input_dim_size, 2, - common::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); + common::errors::InvalidArgument("the input matrix dimension size should " + "greater than or equal to 2.")); PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], input_dim[input_dim_size - 2], common::errors::InvalidArgument( diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 669feb60a6096..4f6a974e6fc97 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2742,7 
+2742,12 @@ def det(x: Tensor, name: str | None = None) -> Tensor: if in_dynamic_or_pir_mode(): return _C_ops.det(x) else: - check_dtype(x.dtype, 'Input', ['float16', 'float32', 'float64'], 'det') + check_dtype( + x.dtype, + 'Input', + ['float16', 'float32', 'float64', 'complex64', 'complex128'], + 'det', + ) input_shape = list(x.shape) assert len(input_shape) >= 2, ( diff --git a/test/legacy_test/test_determinant_op.py b/test/legacy_test/test_determinant_op.py index 4eae7927fea73..cdbd86cfc3f04 100644 --- a/test/legacy_test/test_determinant_op.py +++ b/test/legacy_test/test_determinant_op.py @@ -78,17 +78,76 @@ def init_data(self): ) +class TestDeterminantOpCase3(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.vectorize(complex)( + np.random.rand(10, 10), np.random.rand(10, 10) + ).astype('complex64') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase4(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.vectorize(complex)( + np.random.rand(10, 10), np.random.rand(10, 10) + ).astype('complex128') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase5(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + # not invertible matrix + self.case = np.ones([4, 2, 4, 4]).astype('complex64') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase6(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + # not invertible matrix + self.case = np.ones([4, 2, 4, 4]).astype('complex128') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase7(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.vectorize(complex)( + np.random.rand(5, 3, 10, 10), np.random.rand(5, 3, 10, 10) + ).astype('complex64') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + +class TestDeterminantOpCase8(TestDeterminantOp): + def init_data(self): + np.random.seed(0) + self.case = np.vectorize(complex)( + np.random.rand(5, 3, 10, 10), np.random.rand(5, 3, 10, 10) + ).astype('complex128') + self.inputs = {'Input': self.case} + self.target = np.linalg.det(self.case) + + class TestDeterminantAPI(unittest.TestCase): def setUp(self): np.random.seed(0) + self.dtype = np.float32 self.shape = [3, 3, 5, 5] - self.x = np.random.random(self.shape).astype(np.float32) + self.x = np.random.random(self.shape).astype(self.dtype) self.place = paddle.CPUPlace() def test_api_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.static.data('X', self.shape) + x = paddle.static.data('X', self.shape, dtype=self.dtype) out_value = paddle.linalg.det(x) exe = paddle.static.Executor(self.place) (out_np,) = exe.run(feed={'X': self.x}, fetch_list=[out_value]) @@ -107,6 +166,28 @@ def test_api_dygraph(self): paddle.enable_static() +class TestDeterminantAPIComplex(TestDeterminantAPI): + def setUp(self): + np.random.seed(0) + self.dtype = np.complex64 + self.shape = [2, 1, 4, 3, 6, 6] + self.x = np.vectorize(complex)( + np.random.random(self.shape), np.random.random(self.shape) + ).astype(self.dtype) + self.place = paddle.CPUPlace() + + +class TestDeterminantAPIComplex2(TestDeterminantAPI): + def setUp(self): + np.random.seed(0) + self.dtype = np.complex128 + self.shape = [3, 3, 5, 5] + self.x = np.vectorize(complex)( + np.random.random(self.shape), 
np.random.random(self.shape) + ).astype(self.dtype) + self.place = paddle.CPUPlace() + + class TestSlogDeterminantOp(OpTest): def setUp(self): self.op_type = "slogdeterminant" From c0b11bb7acea31dc045c29e4d6c8d1c34d50870d Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:23:35 +0800 Subject: [PATCH 075/135] Add AutoLayoutPass and AutoLayoutSimplifyPass (#67576) * add two pass. * skip filter transpose in conv2d. * add flag. * fix comment and split part of infer into a new PR. * extract func for not implement infermeta. * fix * add test. * fix coverage --- paddle/common/flags.cc | 12 + .../transforms/general/auto_layout_pass.cc | 230 ++++++++++++++++++ .../pir/transforms/general/auto_layout_pass.h | 26 ++ .../general/auto_layout_simplify_pass.cc | 108 ++++++++ .../general/auto_layout_simplify_pass.h | 26 ++ paddle/fluid/pir/transforms/passes.h | 2 + .../jit/dy2static/pir_partial_program.py | 7 + python/paddle/jit/dy2static/utils.py | 6 + test/cpp/pir/pass/CMakeLists.txt | 2 + test/cpp/pir/pass/auto_layout_pass_test.cc | 69 ++++++ test/cpp/pir/pass/auto_layout_program.txt | 16 ++ 11 files changed, 504 insertions(+) create mode 100644 paddle/fluid/pir/transforms/general/auto_layout_pass.cc create mode 100644 paddle/fluid/pir/transforms/general/auto_layout_pass.h create mode 100644 paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.cc create mode 100644 paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.h create mode 100644 test/cpp/pir/pass/auto_layout_pass_test.cc create mode 100644 test/cpp/pir/pass/auto_layout_program.txt diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc index 0fc9e163a5655..fd0cc0024d510 100644 --- a/paddle/common/flags.cc +++ b/paddle/common/flags.cc @@ -1394,6 +1394,18 @@ PHI_DEFINE_EXPORTED_bool( false, "EinsumOp backward will be speedup at the expense of more gpu memory."); +/** + * Performance related FLAG + * Name: enable_auto_layout_pass + * Since Version: 3.0.0 + * Value Range: bool, default=false + * Example: + * Note: If True, using AutoLayoutPass and AutuLayoutSimplifyPass by default + */ +PHI_DEFINE_EXPORTED_bool(enable_auto_layout_pass, + false, + "Whether enable auto_layout_pass."); + /** * JitLayer related FLAG * Name: FLAGS_jit_engine_type diff --git a/paddle/fluid/pir/transforms/general/auto_layout_pass.cc b/paddle/fluid/pir/transforms/general/auto_layout_pass.cc new file mode 100644 index 0000000000000..9eae49aa56a46 --- /dev/null +++ b/paddle/fluid/pir/transforms/general/auto_layout_pass.cc @@ -0,0 +1,230 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
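+//
+// A sketch of the rewrite this pass performs (value names here are
+// illustrative, not produced by the pass): a conv2d that runs in NCHW,
+//
+//   %y = conv2d(%x) {data_format = "NCHW"}
+//
+// is wrapped with layout transposes and switched to NHWC,
+//
+//   %t  = transpose(%x,  perm = [0, 2, 3, 1])   // NCHW -> NHWC
+//   %y1 = conv2d(%t) {data_format = "NHWC"}
+//   %y  = transpose(%y1, perm = [0, 3, 1, 2])   // NHWC -> NCHW
+//
+// Back-to-back transpose pairs created this way are expected to be
+// cancelled by the companion AutoLayoutSimplifyPass.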
+ +#include "paddle/fluid/pir/transforms/general/auto_layout_pass.h" + +#include +#include +#include +#include + +#include "paddle/common/enforce.h" +#include "paddle/common/layout.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/include/core/builtin_dialect.h" +#include "paddle/pir/include/core/ir_context.h" +#include "paddle/pir/include/core/op_trait.h" +#include "paddle/pir/include/core/program.h" +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" +#include "paddle/pir/include/pass/utils.h" + +namespace { + +class AutoLayoutPass : public pir::Pass { + public: + AutoLayoutPass() : pir::Pass("auto_layout_pass", 3) {} + void Run(pir::Operation* op) override { + for (size_t i = 0; i < op->num_regions(); ++i) { + auto& region = op->region(i); + for (auto& block : region) { + pir::Builder builder = pir::Builder(ctx_, &block); + VLOG(4) << "Transforming block"; + TransferLayout(builder, &block); + } + } + } + + bool CanApplyOn(pir::Operation* op) const override { + return op->num_regions() > 0; + } + + private: + void RewriteLayout(pir::Operation* op, + const std::vector& input_values) { // NOLINT + auto InferMetaSpecificOp = [&]() { + // Op not implement InferMetaInterface interface, so we need to rewrite + // manually + if (op->isa()) { + auto out = op->dyn_cast().out(); + std::vector new_out_type; + for (auto v : op->operands_source()) { + new_out_type.push_back(v.type()); + } + auto new_out_type_v = + pir::VectorType::get(pir::IrContext::Instance(), new_out_type); + out.set_type(new_out_type_v); + } else { + PADDLE_THROW(common::errors::Unimplemented( + "`%s` should implement InferMetaInterface interface or rewrite " + "manually, but not found.", + op->name())); + } + }; + + if (op->HasAttribute("data_format")) { + op->set_attribute("data_format", pir::StrAttribute::get(ctx_, "NHWC")); + } + auto p_attribute_map = op->attributes(); + + if (auto infer_meta_interface = + op->dyn_cast()) { + auto output_types = + infer_meta_interface.InferMeta(input_values, &p_attribute_map); + for (size_t i = 0; i < output_types.size(); ++i) { + op->result(i).set_type(output_types[i]); + pir::SetNewLayoutForValue(op->result(i), common::DataLayout::NHWC); + } + } else { + InferMetaSpecificOp(); + } + } + + bool IsInsertTransposeOpBefore(pir::Operation* op) { + bool is_insert_transpose = false; + + auto JudgeOperand = [&](const pir::Value& operand, + std::vector layout) { + if (!JudgeValue(operand)) return false; + auto transposeInputOp = + operand.defining_op(); + if (!transposeInputOp) return false; + const auto perm_attr = + transposeInputOp.attribute("perm"); + std::vector perm; + for (size_t i = 0; i < perm_attr.size(); ++i) { + auto attr = perm_attr.at(i); + perm.push_back(attr.dyn_cast().data()); + } + return perm == layout; + }; + for (pir::Value operand : op->operands_source()) { + if (operand.type().isa()) { + auto defined_op = operand.defining_op(); + for (auto inner_operand : defined_op->operands_source()) { + is_insert_transpose = JudgeOperand(inner_operand, NHWC2NCHW_); + if (is_insert_transpose) break; + } + } else { + is_insert_transpose = JudgeOperand(operand, NHWC2NCHW_); + } + if 
(is_insert_transpose) break; + } + return is_insert_transpose; + } + + void TransferLayout(pir::Builder builder, pir::Block* block) { + for (auto&& op_item : *block) { + auto op = &op_item; + auto op_name = op->name(); + + // Skip special ops. + if (op->HasTrait()) continue; + if (op->operands().size() == 0) continue; + + // NHWC ops branch, Only support conv2d now, it will add white list later. + if (op->isa()) { + if (op->HasAttribute("data_format") && + op->attribute("data_format").AsString() == + "NCHW") { + VLOG(4) << "enter NHWC op: " << op_name; + DoTransposeOpOperand(op, builder); + RewriteLayout(op, op->operands_source()); + DoTransposeOpResult(op, builder); + } + } else if (IsInsertTransposeOpBefore(op)) { + VLOG(4) << "enter NCHW op: " << op_name; + DoTransposeOpOperand(op, builder); + RewriteLayout(op, op->operands_source()); + DoTransposeOpResult(op, builder); + } + } + } + + // Skip the operand which is not dense tensor or not 4-D tensor, they don't + // need transpose. + bool JudgeValue(const pir::Value& value) { + if (!value) { + PADDLE_THROW(common::errors::Fatal( + "value is null, please check the input tensor.")); + } + if (!value.type()) { + PADDLE_THROW(common::errors::Fatal( + "value type is null, please check the input tensor type.")); + } + if (auto type = value.type().dyn_cast()) { + return type.dims().size() == 4; + } + return false; + } + + void DoTransposeOpOperand(pir::Operation* op, + pir::Builder& builder) { // NOLINT + builder.set_insertion_point(op); + + // For conv2d, only transpose the input. + if (op->isa()) { + auto inp = op->operand(0); + if (!JudgeValue(inp.source())) return; + auto transpose_op = + builder.Build(inp.source(), NCHW2NHWC_); + pir::SetNewLayoutForValue(transpose_op->result(0), + common::DataLayout::NHWC); + inp.set_source(transpose_op->result(0)); + return; + } + + for (auto& operand : op->operands()) { + if (!JudgeValue(operand.source())) continue; + // Canbe optimize with cache when not eliminate the transpose op. + auto transpose_op = builder.Build( + operand.source(), NCHW2NHWC_); + pir::SetNewLayoutForValue(transpose_op->result(0), + common::DataLayout::NHWC); + operand.set_source(transpose_op->result(0)); + } + } + void DoTransposeOpResult(pir::Operation* op, + pir::Builder& builder) { // NOLINT + builder.SetInsertionPointAfter(op); + for (auto& result : op->results()) { + if (result.use_empty()) continue; + if (!JudgeValue(result)) continue; + auto transpose_op = + builder.Build(result, NHWC2NCHW_); + pir::SetNewLayoutForValue(transpose_op->result(0), + common::DataLayout::NCHW); + result.ReplaceAllUsesWith(transpose_op->result(0)); + transpose_op->operand(0).set_source(result); + } + } + pir::IrContext* ctx_ = pir::IrContext::Instance(); + const std::vector NCHW2NHWC_ = {0, 2, 3, 1}; + const std::vector NHWC2NCHW_ = {0, 3, 1, 2}; +}; +} // namespace +namespace pir { + +std::unique_ptr CreateAutoLayoutPass() { + return std::make_unique(); +} + +} // namespace pir + +REGISTER_IR_PASS(auto_layout_pass, AutoLayoutPass); diff --git a/paddle/fluid/pir/transforms/general/auto_layout_pass.h b/paddle/fluid/pir/transforms/general/auto_layout_pass.h new file mode 100644 index 0000000000000..36b7fe27d7694 --- /dev/null +++ b/paddle/fluid/pir/transforms/general/auto_layout_pass.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/pir/include/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateAutoLayoutPass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.cc b/paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.cc new file mode 100644 index 0000000000000..ef192360b2c2f --- /dev/null +++ b/paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.h" + +#include +#include +#include +#include + +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" +#include "paddle/common/layout.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.h" +#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/include/core/builtin_dialect.h" +#include "paddle/pir/include/core/ir_context.h" +#include "paddle/pir/include/core/program.h" +#include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" +#include "paddle/pir/include/pass/utils.h" + +namespace { + +class RedundantTransposePattern + : public pir::OpRewritePattern { + public: + using pir::OpRewritePattern::OpRewritePattern; + + bool Match(paddle::dialect::TransposeOp op) const override { + auto before_transpose = op.x().defining_op(); + if (!before_transpose->isa()) { + return false; + } + const auto before_perm_attr = + before_transpose->attribute("perm"); + + std::vector before_perm; + for (size_t i = 0; i < before_perm_attr.size(); ++i) { + auto attr = before_perm_attr.at(i); + before_perm.push_back(attr.dyn_cast().data()); + } + + const auto after_perm_attr = op.attribute("perm"); + std::vector after_perm; + for (size_t i = 0; i < after_perm_attr.size(); ++i) { + auto attr = after_perm_attr.at(i); + after_perm.push_back(attr.dyn_cast().data()); + } + + if (before_perm == NCHW2NHWC_ && after_perm == NHWC2NCHW_) return true; + if (before_perm == NHWC2NCHW_ && after_perm == NCHW2NHWC_) return true; + return false; + } + void Rewrite(paddle::dialect::TransposeOp op, + pir::PatternRewriter& rewriter) const override { + auto before_transpose = + 
op.x().defining_op()->dyn_cast(); + rewriter.ReplaceAllUsesWith(op.out(), before_transpose.x()); + rewriter.EraseOp(op); + if (before_transpose.out().use_empty()) { + rewriter.EraseOp(before_transpose); + } + } + + private: + const std::vector NCHW2NHWC_ = {0, 2, 3, 1}; + const std::vector NHWC2NCHW_ = {0, 3, 1, 2}; +}; +class AutoLayoutSimplifyPass : public pir::PatternRewritePass { + public: + AutoLayoutSimplifyPass() + : pir::PatternRewritePass("auto_layout_simplify_pass", 3) {} + pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override { + pir::RewritePatternSet ps(context); + ps.Add(context); + return ps; + } + + bool CanApplyOn(pir::Operation* op) const override { + return op->num_regions() > 0; + } +}; +} // namespace +namespace pir { + +std::unique_ptr CreateAutoLayoutSimplifyPass() { + return std::make_unique(); +} + +} // namespace pir + +REGISTER_IR_PASS(auto_layout_simplify_pass, AutoLayoutSimplifyPass); diff --git a/paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.h b/paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.h new file mode 100644 index 0000000000000..ffc7c32f242fd --- /dev/null +++ b/paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/pir/include/core/dll_decl.h" + +namespace pir { + +class Pass; + +IR_API std::unique_ptr CreateAutoLayoutSimplifyPass(); + +} // namespace pir diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h index 86f9f6656847e..78d3bafbb8211 100644 --- a/paddle/fluid/pir/transforms/passes.h +++ b/paddle/fluid/pir/transforms/passes.h @@ -47,6 +47,8 @@ USE_PIR_PASS(delete_quant_dequant_linear_op_pass); USE_PIR_PASS(transfer_layout_pass); USE_PIR_PASS(fused_rotary_position_embedding_pass); USE_PIR_PASS(horizontal_fuse_pass); +USE_PIR_PASS(auto_layout_simplify_pass); +USE_PIR_PASS(auto_layout_pass); USE_PIR_PASS(common_subexpression_elimination_pass); USE_PIR_PASS(add_shadow_output_after_dead_parameter_pass); diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index 3aa3eb1920364..4effecf5f9470 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -35,6 +35,7 @@ from .logging_utils import TranslatorLogger from .utils import ( RETURN_NO_VALUE_MAGIC_NUM, + auto_layout_is_enabled, backend_guard, cinn_is_enabled, cse_is_enabled, @@ -673,6 +674,12 @@ def _get_scope(self, program_id=None, use_scope_cache=False): # whole @switch_to_static_graph def _create_program(self, is_infer_mode=False): + if auto_layout_is_enabled(): + pm = paddle.pir.PassManager(3) + pm.add_pass("auto_layout_pass", {}) + pm.add_pass("auto_layout_simplify_pass", {}) + pm.run(self._origin_main_program) + if is_infer_mode: def pass_fn(forward_program, backward_program, program_name_attr): diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index c4f48916ef6de..8f549a95218c7 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -678,6 +678,12 @@ def is_api_in_module_helper(obj, module_prefix): return m is not None and m.__name__.startswith(module_prefix) +def auto_layout_is_enabled(): + return paddle.get_flags(["FLAGS_enable_auto_layout_pass"])[ + "FLAGS_enable_auto_layout_pass" + ] + + def is_builtin(func, name=None): """predict whether a function is a builtin function with name={name}. if name == None, then any builtin function will return True diff --git a/test/cpp/pir/pass/CMakeLists.txt b/test/cpp/pir/pass/CMakeLists.txt index 904d6e5e4e7ab..abb1ea150a280 100644 --- a/test/cpp/pir/pass/CMakeLists.txt +++ b/test/cpp/pir/pass/CMakeLists.txt @@ -17,4 +17,6 @@ if(WITH_GPU) # be build only in CI, so suppose the generator in Windows is Ninja. copy_onnx(transfer_layout_pass_test) endif() + + paddle_test(auto_layout_pass_test SRCS auto_layout_pass_test.cc) endif() diff --git a/test/cpp/pir/pass/auto_layout_pass_test.cc b/test/cpp/pir/pass/auto_layout_pass_test.cc new file mode 100644 index 0000000000000..b8a2690b59454 --- /dev/null +++ b/test/cpp/pir/pass/auto_layout_pass_test.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/common/layout.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include "paddle/fluid/ir_adaptor/translator/translate.h" +#include "paddle/fluid/pir/dialect/operator/interface/layout_transformation.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/transforms/general/auto_layout_pass.h" +#include "paddle/fluid/pir/transforms/general/auto_layout_simplify_pass.h" +#include "paddle/fluid/pir/transforms/general/constant_folding_pass.h" +#include "paddle/fluid/pir/transforms/passes.h" +#include "paddle/pir/include/core/builtin_dialect.h" +#include "paddle/pir/include/core/ir_context.h" +#include "paddle/pir/include/core/parser/ir_parser.h" +#include "paddle/pir/include/core/program.h" +#include "paddle/pir/include/pass/pass_manager.h" +using pir::IrParser; + +TEST(auto_layout_pass, pass_test) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + std::stringstream ss; + std::ifstream file("auto_layout_program.txt"); + if (file) { + ss << file.rdbuf(); + } + + auto program = pir::IrParser(ctx, ss).ParseProgram(); + + pir::PassManager auto_layout_pm(::pir::IrContext::Instance(), 3); + auto_layout_pm.AddPass(pir::CreateAutoLayoutPass()); + auto_layout_pm.AddPass(pir::CreateAutoLayoutSimplifyPass()); + auto_layout_pm.Run(program.get()); +} diff --git a/test/cpp/pir/pass/auto_layout_program.txt b/test/cpp/pir/pass/auto_layout_program.txt new file mode 100644 index 0000000000000..2192653114fb0 --- /dev/null +++ b/test/cpp/pir/pass/auto_layout_program.txt @@ -0,0 +1,16 @@ +{ + (%0) = "builtin.parameter" () {parameter_name:"conv2d_0.w_0",persistable:[true],stop_gradient:[false]} : () -> builtin.tensor<64x8x7x7xf32> + (%1) = "builtin.parameter" () {parameter_name:"batch_norm_0.b_0",persistable:[true],stop_gradient:[false]} : () -> builtin.tensor<64xf32> + (%2) = "builtin.parameter" () {parameter_name:"batch_norm_0.w_1",persistable:[true],stop_gradient:[true]} : () -> builtin.tensor<64xf32> + (%3) = "builtin.parameter" () {parameter_name:"batch_norm_0.w_0",persistable:[true],stop_gradient:[false]} : () -> builtin.tensor<64xf32> + (%4) = "builtin.parameter" () {parameter_name:"batch_norm_0.w_2",persistable:[true],stop_gradient:[true]} : () -> builtin.tensor<64xf32> + (%5) = "builtin.parameter" () {parameter_name:"conv2d_2.w_0",persistable:[true],stop_gradient:[false]} : () -> builtin.tensor<64x64x3x3xf32> + (%6) = "pd_op.feed" () {col:(Int32)1,name:"label",persistable:[false],stop_gradient:[true]} : () -> builtin.tensor<-1x1xi64> + (%7) = "pd_op.feed" () {col:(Int32)0,name:"data",persistable:[false],stop_gradient:[true]} : () -> builtin.tensor<-1x8x224x224xf32> + (%8) = "pd_op.conv2d" (%7, %0) {data_format:"NCHW",dilations:[(Int32)1,(Int32)1],groups:(Int32)1,padding_algorithm:"EXPLICIT",paddings:[(Int32)3,(Int32)3],persistable:[false],stop_gradient:[false],strides:[(Int32)2,(Int32)2]} : (builtin.tensor<-1x3x224x224xf32>, 
builtin.tensor<64x3x7x7xf32>) -> builtin.tensor<-1x64x112x112xf32> + (%9, %10, %11, %12, %13, %14) = "pd_op.batch_norm_" (%8, %2, %4, %3, %1) {data_format:"NCHW",epsilon:(Float)1e-05,is_test:false,momentum:(Float)0.9,persistable:[false,true,true,false,false,false],stop_gradient:[false,true,true,true,true,true],trainable_statistics:false,use_global_stats:false} : (builtin.tensor<-1x64x112x112xf32>, builtin.tensor<64xf32>, builtin.tensor<64xf32>, builtin.tensor<64xf32>, builtin.tensor<64xf32>) -> builtin.tensor<-1x64x112x112xf32>, builtin.tensor<64xf32>, builtin.tensor<64xf32>, builtin.tensor<64xf32>, builtin.tensor<64xf32>, builtin.tensor<-1xf32> + (%15) = "pd_op.relu" (%9) {persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x64x112x112xf32>) -> builtin.tensor<-1x64x112x112xf32> + (%16) = "pd_op.conv2d" (%15, %5) {data_format:"NCHW",dilations:[(Int32)1,(Int32)1],groups:(Int32)1,padding_algorithm:"EXPLICIT",paddings:[(Int32)1,(Int32)1],persistable:[false],stop_gradient:[false],strides:[(Int32)1,(Int32)1]} : (builtin.tensor<-1x64x56x56xf32>, builtin.tensor<64x64x3x3xf32>) -> builtin.tensor<-1x64x56x56xf32> + (%17) = "builtin.combine" (%9, %15) {} : (builtin.tensor<-1x64x112x112xf32>, builtin.tensor<-1x64x112x112xf32>) -> vec[builtin.tensor<-1x64x112x112xf32>,builtin.tensor<-1x64x112x112xf32>] + (%18) = "pd_op.add_n" (%17) {persistable:[false],stop_gradient:[false]} : (vec[builtin.tensor<-1x64x112x112xf32>,builtin.tensor<-1x64x112x112xf32>]) -> builtin.tensor<-1x64x112x112xf32> +} From 4e236deed853647d47af662778f888381513fd1b Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:07:12 +0800 Subject: [PATCH 076/135] fix arange op decomp bug (#68606) --- paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml | 1 + paddle/phi/ops/yaml/inconsistent/update_ops.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml b/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml index c7744ae59233b..0bc5f97ba1213 100755 --- a/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml @@ -31,6 +31,7 @@ backend : place data_transform : support_trans_dtype : start, end, step + traits : paddle::dialect::ForwardOnlyTrait - op : assign args : (Tensor x) diff --git a/paddle/phi/ops/yaml/inconsistent/update_ops.yaml b/paddle/phi/ops/yaml/inconsistent/update_ops.yaml index f8972c5cab25a..9fcea7fe54ef9 100644 --- a/paddle/phi/ops/yaml/inconsistent/update_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/update_ops.yaml @@ -16,3 +16,4 @@ backend : place support_tensor : [start, end, step] interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait From 41bf100203e838454da44cbadabc7698d970e45a Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:10:11 +0800 Subject: [PATCH 077/135] fix (#68626) --- test/cpp/pir/pass/auto_layout_pass_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/test/cpp/pir/pass/auto_layout_pass_test.cc b/test/cpp/pir/pass/auto_layout_pass_test.cc index b8a2690b59454..ff0f7124b5804 100644 --- a/test/cpp/pir/pass/auto_layout_pass_test.cc +++ b/test/cpp/pir/pass/auto_layout_pass_test.cc @@ -29,7 +29,6 @@ #include "paddle/common/layout.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include 
"paddle/fluid/inference/api/paddle_pass_builder.h" From 06f57c9b9cd3d1505d56184df442024d071c005a Mon Sep 17 00:00:00 2001 From: Miao Zhong <156628066+Micalling@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:17:48 +0800 Subject: [PATCH 078/135] =?UTF-8?q?=E3=80=90Hackathon=207th=20No.22?= =?UTF-8?q?=E3=80=91NO.22=20=E5=9C=A8=20paddle.audio.functional.get=5Fwind?= =?UTF-8?q?ow=20=E4=B8=AD=E6=94=AF=E6=8C=81=20bartlett=20=E3=80=81=20kaise?= =?UTF-8?q?r=20=E5=92=8C=20nuttall=20=E7=AA=97=E5=87=BD=E6=95=B0=20-part?= =?UTF-8?q?=20(#68268)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 【Hackathon 7th No.22】NO.22 在 paddle.audio.functional.get_window 中支持 bartlett 、 kaiser 和 nuttall 窗函数 * update * update * update --- python/paddle/audio/features/layers.py | 10 +- python/paddle/audio/functional/window.py | 63 +++++++++++- test/legacy_test/test_audio_functions.py | 19 +++- test/legacy_test/test_get_window.py | 120 +++++++++++++++++++++++ 4 files changed, 201 insertions(+), 11 deletions(-) create mode 100644 test/legacy_test/test_get_window.py diff --git a/python/paddle/audio/features/layers.py b/python/paddle/audio/features/layers.py index 5f72d27d854d5..1f578f072b8e5 100644 --- a/python/paddle/audio/features/layers.py +++ b/python/paddle/audio/features/layers.py @@ -31,6 +31,8 @@ 'hamming', 'hann', 'kaiser', + 'bartlett', + 'nuttall', 'gaussian', 'exponential', 'triang', @@ -50,7 +52,7 @@ class Spectrogram(nn.Layer): n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. - window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor', 'bartlett', 'kaiser', 'nuttall'. Defaults to 'hann'. power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. @@ -135,7 +137,7 @@ class MelSpectrogram(nn.Layer): n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. - window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + window (str, optional): The window function applied to the signal before the Fourier transform. 
Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor', 'bartlett', 'kaiser', 'nuttall'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
         center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
@@ -242,7 +244,7 @@ class LogMelSpectrogram(nn.Layer):
         n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
         hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
         win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor', 'bartlett', 'kaiser', 'nuttall'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
         center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
@@ -350,7 +352,7 @@ class MFCC(nn.Layer):
         n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
         hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
         win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor', 'bartlett', 'kaiser', 'nuttall'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
         center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
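The next diff adds the three window kernels behind these docstring updates. As a minimal usage sketch (it assumes a Paddle build that carries this patch, plus numpy and scipy for the cross-checks), the new windows are requested through paddle.audio.functional.get_window just as test_get_window.py below exercises them:

import numpy as np
from scipy import signal

import paddle

n_fft = 512
# Parameter-free windows are requested by name.
bartlett = paddle.audio.functional.get_window('bartlett', n_fft)
nuttall = paddle.audio.functional.get_window('nuttall', n_fft)
# 'kaiser' needs a beta parameter, so it is passed as a tuple.
kaiser = paddle.audio.functional.get_window(('kaiser', 14.0), n_fft)

# Cross-check against scipy; both sides default to periodic (fftbins=True)
# windows, so the results should agree closely.
np.testing.assert_allclose(
    bartlett.numpy(), signal.get_window('bartlett', n_fft), atol=1e-4
)
np.testing.assert_allclose(
    kaiser.numpy(), signal.get_window(('kaiser', 14.0), n_fft), atol=1e-4
)

Parameterized windows such as 'kaiser' follow the same tuple convention that 'gaussian' and 'exponential' already use, which is why the error branch in the diff below lists all three.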
diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py index 22197ec192b44..2962cb0fbed0e 100644 --- a/python/paddle/audio/functional/window.py +++ b/python/paddle/audio/functional/window.py @@ -55,6 +55,61 @@ def _cat(x: list[Tensor], data_type: str) -> Tensor: return paddle.concat(l) +@window_function_register.register() +def _bartlett(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """ + Computes the Bartlett window. + This function is consistent with scipy.signal.windows.bartlett(). + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + M = paddle.to_tensor(M, dtype=dtype) + w = paddle.where( + paddle.less_equal(n, (M - 1) / 2.0), + 2.0 * n / (M - 1), + 2.0 - 2.0 * n / (M - 1), + ) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _kaiser( + M: int, beta: float, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute the Kaiser window. + This function is consistent with scipy.signal.windows.kaiser(). + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + beta = paddle.to_tensor(beta, dtype=dtype) + + n = paddle.arange(0, M, dtype=dtype) + M = paddle.to_tensor(M, dtype=dtype) + alpha = (M - 1) / 2.0 + w = paddle.i0( + beta * paddle.sqrt(1 - ((n - alpha) / alpha) ** 2.0) + ) / paddle.i0(beta) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _nuttall(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Nuttall window. + This function is consistent with scipy.signal.windows.nuttall(). + """ + a = paddle.to_tensor( + [0.3635819, 0.4891775, 0.1365995, 0.0106411], dtype=dtype + ) + return _general_cosine(M, a=a, sym=sym, dtype=dtype) + + @window_function_register.register() def _acosh(x: Tensor | float) -> Tensor: if isinstance(x, float): @@ -347,7 +402,7 @@ def get_window( """Return a window of a given length and type. Args: - window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor', 'bartlett', 'kaiser', 'nuttall'. win_length (int): Number of samples. fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. dtype (str, optional): The data type of the return window. Defaults to 'float64'. @@ -364,17 +419,16 @@ def get_window( >>> cosine_window = paddle.audio.functional.get_window('cosine', n_fft) >>> std = 7 - >>> gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft) + >>> gaussian_window = paddle.audio.functional.get_window(('gaussian', std), n_fft) """ sym = not fftbins - args = () if isinstance(window, tuple): winstr = window[0] if len(window) > 1: args = window[1:] elif isinstance(window, str): - if window in ['gaussian', 'exponential']: + if window in ['gaussian', 'exponential', 'kaiser']: raise ValueError( "The '" + window + "' window needs one or " "more parameters -- pass a tuple." 
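For reference, the `_kaiser` kernel added above implements the standard Kaiser definition w[n] = I0(beta * sqrt(1 - ((n - alpha) / alpha)^2)) / I0(beta) with alpha = (M - 1) / 2, where I0 is the zeroth-order modified Bessel function of the first kind, and `_bartlett` builds the triangular ramp 2n/(M - 1) mirrored about the midpoint. A short numpy transcription of the symmetric Kaiser form (an illustrative reference only, not part of the patch; it assumes M >= 2, since the kernel above handles degenerate sizes via _len_guards):

import numpy as np

def kaiser_ref(M: int, beta: float) -> np.ndarray:
    # Symmetric Kaiser window; mirrors the arithmetic in _kaiser above.
    # np.i0 is numpy's zeroth-order modified Bessel function of the first kind.
    n = np.arange(M)
    alpha = (M - 1) / 2.0
    return np.i0(beta * np.sqrt(1 - ((n - alpha) / alpha) ** 2)) / np.i0(beta)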
@@ -388,7 +442,6 @@ def get_window( winfunc = window_function_register.get('_' + winstr) except KeyError as e: raise ValueError("Unknown window type.") from e - params = (win_length, *args) kwargs = {'sym': sym} return winfunc(*params, dtype=dtype, **kwargs) diff --git a/test/legacy_test/test_audio_functions.py b/test/legacy_test/test_audio_functions.py index bac0828fc49d1..3804ae9dc381a 100644 --- a/test/legacy_test/test_audio_functions.py +++ b/test/legacy_test/test_audio_functions.py @@ -257,6 +257,7 @@ def test_gaussian_window_and_exception(self, n_fft: int): np.testing.assert_array_almost_equal( window_scipy_exp, window_paddle_exp.numpy(), decimal=5 ) + try: window_paddle = paddle.audio.functional.get_window("hann", -1) except ValueError: @@ -290,7 +291,14 @@ def dct(n_filters, n_input): np.testing.assert_array_almost_equal(librosa_dct, paddle_dct, decimal=5) @parameterize( - [128, 256, 512], ["hamming", "hann", "triang", "bohman"], [True, False] + [128, 256, 512], + [ + "hamming", + "hann", + "triang", + "bohman", + ], + [True, False], ) def test_stft_and_spect( self, n_fft: int, window_str: str, center_flag: bool @@ -345,7 +353,14 @@ def test_stft_and_spect( ) @parameterize( - [128, 256, 512], [64, 82], ["hamming", "hann", "triang", "bohman"] + [128, 256, 512], + [64, 82], + [ + "hamming", + "hann", + "triang", + "bohman", + ], ) def test_istft(self, n_fft: int, hop_length: int, window_str: str): if len(self.waveform.shape) == 2: # (C, T) diff --git a/test/legacy_test/test_get_window.py b/test/legacy_test/test_get_window.py new file mode 100644 index 0000000000000..189b45d257458 --- /dev/null +++ b/test/legacy_test/test_get_window.py @@ -0,0 +1,120 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import itertools +import unittest + +from parameterized import parameterized +from scipy import signal + +import paddle +import paddle.audio +from paddle.base import core + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params))) + + +class TestAudioFuncitons(unittest.TestCase): + def setUp(self): + paddle.disable_static( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + + @parameterize( + [ + "hamming", + "hann", + "triang", + "bohman", + "blackman", + "cosine", + "tukey", + "taylor", + "bartlett", + "nuttall", + ], + [1, 512], + ) + def test_window(self, window_type: str, n_fft: int): + window_scipy = signal.get_window(window_type, n_fft) + window_paddle = paddle.audio.functional.get_window(window_type, n_fft) + window_scipy = paddle.to_tensor(window_scipy, dtype=window_paddle.dtype) + paddle.allclose( + window_scipy, + window_paddle, + atol=0.0001, + rtol=0.0001, + ) + + @parameterize([1, 512]) + def test_window_and_exception(self, n_fft: int): + window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7) + window_paddle_gaussian = paddle.audio.functional.get_window( + ('gaussian', 7), n_fft, False + ) + window_scipy_gaussain = paddle.to_tensor( + window_scipy_gaussain, dtype=window_paddle_gaussian.dtype + ) + paddle.allclose( + window_scipy_gaussain, + window_paddle_gaussian, + atol=0.0001, + rtol=0.0001, + ) + + window_scipy_general_gaussain = signal.windows.general_gaussian( + n_fft, 1, 7 + ) + window_paddle_general_gaussian = paddle.audio.functional.get_window( + ('general_gaussian', 1, 7), n_fft, False + ) + window_scipy_general_gaussain = paddle.to_tensor( + window_scipy_general_gaussain, + dtype=window_paddle_general_gaussian.dtype, + ) + paddle.allclose( + window_scipy_gaussain, + window_paddle_gaussian, + atol=0.0001, + rtol=0.0001, + ) + + window_scipy_exp = signal.windows.exponential(n_fft) + window_paddle_exp = paddle.audio.functional.get_window( + ('exponential', None, 1), n_fft, False + ) + window_scipy_exp = paddle.to_tensor( + window_scipy_exp, dtype=window_paddle_exp.dtype + ) + paddle.allclose( + window_scipy_exp, window_paddle_exp, atol=0.0001, rtol=0.0001 + ) + + window_scipy_kaiser = signal.windows.kaiser(n_fft, beta=14.0) + window_paddle_kaiser = paddle.audio.functional.get_window( + ('kaiser', 14.0), n_fft + ) + window_scipy_kaiser = paddle.to_tensor( + window_scipy_kaiser, dtype=window_paddle_kaiser.dtype + ) + paddle.allclose( + window_scipy_kaiser, window_paddle_kaiser, atol=0.0001, rtol=0.0001 + ) + + +if __name__ == '__main__': + unittest.main() From f2106200ceb30963aa2ba3509c0e6ed614b045a9 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:36:43 +0800 Subject: [PATCH 079/135] Cherry pick some PRs from incubate/fleety (#68565) * Unify Paddle recall error (#68246) * unify recall error * fix use_paddle_recall_error * Refine Sharding Pad Zero Error Message (#68256) * Separate LossNaN and LossInf error (#68367) * split nan inf * fix fusedlinear bug * fix cast and compare * fix compile error * Fix FusedLinear GLOG core dump bug (#68428) * Fix index_put op wild-pointer error (#68518) * fix conflict * fix format * fix compilation error --- paddle/phi/infermeta/fusion.cc | 7 +- .../phi/kernels/funcs/blas/blaslt_impl.cu.h | 55 ++++------------ .../phi/kernels/funcs/blas/blaslt_impl.hip.h | 65 +++---------------- paddle/phi/kernels/funcs/index_put_utils.h | 10 ++- paddle/phi/kernels/gpu/cast_impl.h | 1 + 
.../phi/kernels/gpu/index_put_grad_kernel.cu | 9 +-- paddle/phi/kernels/gpu/index_put_kernel.cu | 3 +- .../phi/kernels/legacy/kps/compare_kernel.cu | 1 + .../dygraph_sharding_optimizer.py | 12 ++-- .../fleet/meta_parallel/pipeline_parallel.py | 2 +- .../pp_utils/p2p_communication.py | 11 ++-- .../fleet/utils/tensor_fusion_helper.py | 7 +- python/paddle/framework/recall_error.py | 30 +++++++++ 13 files changed, 94 insertions(+), 119 deletions(-) create mode 100644 python/paddle/framework/recall_error.py diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 4a4ebaeec5b01..fa814545c0c13 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -1943,8 +1943,11 @@ void FusedGemmEpilogueGradInferMeta(const MetaTensor& x, x_grad->set_dims(x_dims); x_grad->set_dtype(x.dtype()); } - y_grad->set_dims(y_dims); - y_grad->set_dtype(y.dtype()); + + if (y_grad) { + y_grad->set_dims(y_dims); + y_grad->set_dtype(y.dtype()); + } if (bias_grad) { int64_t dbias_dim = trans_y ? y_dims[0] : y_dims[1]; diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h index 547518bea1f74..eaa4da08a2680 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -323,35 +323,6 @@ struct MatmulDescriptor { } } - std::string GetDescResultString(std::string prefix, - bool has_algo = true) const { - std::ostringstream out; - out << prefix << " \n"; -#define GET_DESC_DATA_STRING(src) \ - do { \ - out << " " << #src << " = ["; \ - int num = sizeof((*src)) / sizeof(src->data[0]); \ - for (int i = 0; i < num; ++i) { \ - if (i == 0) { \ - out << src->data[i]; \ - } else { \ - out << ", " << src->data[i]; \ - } \ - } \ - out << "]\n"; \ - } while (0); - - if (has_algo) { - GET_DESC_DATA_STRING(algo); - } - GET_DESC_DATA_STRING(x_desc); - GET_DESC_DATA_STRING(y_desc); - GET_DESC_DATA_STRING(out_desc); - GET_DESC_DATA_STRING(op_desc); -#undef GET_DESC_DATA_STRING - return out.str(); - } - void ExchangeXYDesc(bool no_exchange) {} protected: @@ -512,15 +483,14 @@ struct CublasLtBase { workspace->ptr(), workspace_size); MatmulDescT* best_desc = new MatmulDescT(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); + VLOG(6) << "[Searched CublasltDescriptor] "; auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); } } - VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); + VLOG(7) << "[Impl CublasltDescriptor] "; PADDLE_ENFORCE_GPU_SUCCESS( dynload::cublasLtMatmul(cublaslt_handle, desc->op_desc, @@ -706,8 +676,7 @@ struct CublasLtBase { workspace /*output parameter*/, workspace_size /*output parameter*/); MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); + VLOG(6) << "[Searched CublasltDescriptor] "; auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); @@ -726,15 +695,14 @@ struct CublasLtBase { workspace->ptr(), workspace_size); MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); + VLOG(6) << "[Searched CublasltDescriptor] "; auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); } } - VLOG(7) << desc->GetDescResultString("[Impl 
CublasltDescriptor] "); + VLOG(7) << "[Impl CublasltDescriptor] "; PADDLE_ENFORCE_GPU_SUCCESS( dynload::cublasLtMatmul(cublaslt_handle, desc->op_desc, @@ -1040,11 +1008,16 @@ struct DescriptorSetter { sub_key = planner->GenSubKey(); } - auto& matmul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - if (matmul_cache.FindSubKey(sub_key)) { + bool has_cache = false; + if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune()) { + auto& matmul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + has_cache = matmul_cache.FindSubKey(sub_key); + } + if (has_cache) { + auto& matmul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); desc = *(reinterpret_cast(matmul_cache.GetSubKey(sub_key))); desc.template SetFusedEpiloguePtr(planner); - VLOG(7) << desc.GetDescResultString("[Heap CublasltDescriptor] "); + VLOG(7) << "[Heap CublasltDescriptor] "; } else { desc.template Create(M, N, @@ -1061,7 +1034,7 @@ struct DescriptorSetter { if (planner != nullptr) { desc.template SetFusedEpiloguePtr(planner); } - VLOG(7) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false); + VLOG(7) << "[Stack CublasltDescriptor] "; } } }; diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h index d3e318ef76827..a6548878dd042 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h @@ -322,50 +322,6 @@ struct MatmulDescriptor { } } - std::string GetDescResultString(std::string prefix, - bool has_algo = true) const { - std::ostringstream out; - out << prefix << " \n"; -#define GET_DESC_DATA_STRING(src) \ - do { \ - out << " " << #src << " = ["; \ - int num = sizeof((*src)) / sizeof(src->data[0]); \ - for (int i = 0; i < num; ++i) { \ - if (i == 0) { \ - out << src->data[i]; \ - } else { \ - out << ", " << src->data[i]; \ - } \ - } \ - out << "]\n"; \ - } while (0); - -#define GET_ALGO_DATA_STRING(algo) \ - do { \ - out << " " << #algo << " = ["; \ - for (int i = 0; i < 16; ++i) { \ - if (i == 0) { \ - out << static_cast(algo->data[i]); \ - } else { \ - out << ", " << static_cast(algo->data[i]); \ - } \ - } \ - out << ", max_workspace_bytes: " << algo->max_workspace_bytes; \ - out << "]\n"; \ - } while (0); - - if (has_algo) { - GET_ALGO_DATA_STRING(algo); - } - GET_DESC_DATA_STRING(x_desc); - GET_DESC_DATA_STRING(y_desc); - GET_DESC_DATA_STRING(out_desc); - GET_DESC_DATA_STRING(op_desc); -#undef GET_DESC_DATA_STRING -#undef GET_ALGO_DATA_STRING - return out.str(); - } - void ExchangeXYDesc(bool no_exchange) {} protected: @@ -540,8 +496,7 @@ struct CublasLtBase { workspace->ptr(), workspace_size); MatmulDescT* best_desc = new MatmulDescT(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched HipblasltDescriptor] "); + VLOG(6) << "[Searched HipblasltDescriptor] "; auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); @@ -576,13 +531,12 @@ struct CublasLtBase { *algo = heuristic_results.algo; PADDLE_ENFORCE_GPU_SUCCESS( dynload::hipblasLtMatmulPreferenceDestroy(preference)); - VLOG(4) << desc->GetDescResultString( - "[Searched Single HipblasltDescriptor] "); + VLOG(4) << "[Searched Single HipblasltDescriptor] "; } VLOG(4) << "CublasLtBase<> doesn't searched"; } - VLOG(4) << desc->GetDescResultString("[Impl HipblasltDescriptor] "); + VLOG(4) << "[Impl HipblasltDescriptor] "; PADDLE_ENFORCE_GPU_SUCCESS( dynload::hipblasLtMatmul(hipblaslt_handle, desc->op_desc, @@ -768,8 +722,7 @@ struct 
CublasLtBase { workspace /*output parameter*/, workspace_size /*output parameter*/); MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); + VLOG(6) << "[Searched CublasltDescriptor] "; auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); @@ -788,15 +741,14 @@ struct CublasLtBase { workspace->ptr(), workspace_size); MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched HipblasltDescriptor] "); + VLOG(6) << "[Searched HipblasltDescriptor] "; auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); } } - VLOG(7) << desc->GetDescResultString("[Impl HipblasltDescriptor] "); + VLOG(7) << "[Impl HipblasltDescriptor] "; PADDLE_ENFORCE_GPU_SUCCESS( dynload::hipblasLtMatmul(hipblaslt_handle, desc->op_desc, @@ -1054,7 +1006,7 @@ struct DescriptorSetter { if (matmul_cache.FindSubKey(sub_key)) { desc = *(reinterpret_cast(matmul_cache.GetSubKey(sub_key))); desc.template SetFusedEpiloguePtr(planner); - VLOG(7) << desc.GetDescResultString("[Heap HipblasltDescriptor] "); + VLOG(7) << "[Heap HipblasltDescriptor] "; } else { desc.template Create(M, N, @@ -1071,8 +1023,7 @@ struct DescriptorSetter { if (planner != nullptr) { desc.template SetFusedEpiloguePtr(planner); } - VLOG(7) << desc.GetDescResultString("[Stack HipblasltDescriptor] ", - false); + VLOG(7) << "[Stack HipblasltDescriptor] "; } } }; diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index e82068cb3d847..3bfb9ddc87aa6 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -184,12 +184,18 @@ static phi::DDim BroadCastTensorsDims( template T** GetDevicePointerArray(const Context& ctx, - const std::vector& indices_v) { + const std::vector& indices_v, + phi::Allocator::AllocationPtr* holder_ptr) { + PADDLE_ENFORCE_NOT_NULL( + holder_ptr, + phi::errors::InvalidArgument( + "hold_ptr should be provided when calling GetDevicePointerArray.")); std::vector h_indices_v(indices_v.size()); for (size_t i = 0; i < indices_v.size(); ++i) { h_indices_v[i] = indices_v[i]->data(); } - auto d_indices_data = phi::memory_utils::Alloc( + auto& d_indices_data = *holder_ptr; + d_indices_data = phi::memory_utils::Alloc( ctx.GetPlace(), h_indices_v.size() * sizeof(T*), phi::Stream(reinterpret_cast(ctx.stream()))); diff --git a/paddle/phi/kernels/gpu/cast_impl.h b/paddle/phi/kernels/gpu/cast_impl.h index 06ea1b1ab7cd3..e40bc30da1d98 100644 --- a/paddle/phi/kernels/gpu/cast_impl.h +++ b/paddle/phi/kernels/gpu/cast_impl.h @@ -37,6 +37,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, outputs.emplace_back(out); dev_ctx.Alloc(out); out->set_type(out_dtype); + if (out->numel() == 0) return; phi::funcs::ElementwiseKernel( dev_ctx, inputs, &outputs, CastFunctor()); } diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 605838f510fbf..07ecfd4ac2846 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -96,6 +96,7 @@ void LaunchIndexPutGradCudaKernel( const bool accumulate, DenseTensor* value_grad, DenseTensor* x_grad) { + phi::Allocator::AllocationPtr indices_holder_1, indices_holder_2; if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); 
if (!accumulate) { @@ -112,8 +113,8 @@ void LaunchIndexPutGradCudaKernel( } const int64_t numel = indices[0]->numel(); - auto pd_indices = - funcs::GetDevicePointerArray(dev_ctx, indices); + auto pd_indices = funcs::GetDevicePointerArray( + dev_ctx, indices, &indices_holder_1); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); SetZeroCudaKernel<<numel(); - auto pd_indices = - funcs::GetDevicePointerArray(dev_ctx, indices); + auto pd_indices = funcs::GetDevicePointerArray( + dev_ctx, indices, &indices_holder_2); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); if (value_grad) { diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index 17b5b87420c8c..ea7440fabfce4 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -89,8 +89,9 @@ void LaunchIndexPutCudaKernel(const Context& dev_ctx, int64_t is_single_val_tensor = (value.numel() == 1) ? 0 : INT64_MAX; const int64_t numel = indices[0]->numel(); + phi::Allocator::AllocationPtr holder; auto pd_indices = - funcs::GetDevicePointerArray(dev_ctx, indices); + funcs::GetDevicePointerArray(dev_ctx, indices, &holder); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); IndexPutCudaKernel diff --git a/paddle/phi/kernels/legacy/kps/compare_kernel.cu b/paddle/phi/kernels/legacy/kps/compare_kernel.cu index 429cff41886a1..549a92508c129 100644 --- a/paddle/phi/kernels/legacy/kps/compare_kernel.cu +++ b/paddle/phi/kernels/legacy/kps/compare_kernel.cu @@ -40,6 +40,7 @@ inline void CompareRawKernelImpl(const Context& ctx, DenseTensor* out) { ctx.template Alloc(out); out->set_type(phi::DataType::BOOL); + if (out->numel() == 0) return; std::vector ins{&x, &y}; std::vector outs{out}; funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index a6f5d4b9e6af9..461c98c77e308 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -26,6 +26,10 @@ ReduceOp, is_avg_reduce_op_supported, ) +from paddle.framework.recall_error import ( + SHARDING_PAD_NON_ZERO_ERROR, + check_naninf, +) from paddle.utils import strtobool from ...utils import timer_helper as timer @@ -344,10 +348,10 @@ def reduce_gradients(self, parameter_list, hcg): os.getenv('FLAGS_pp_check_naninf', '0') ) if need_check: - naninf = paddle.isfinite(g_var).all() - if not naninf.item(): + err_msg = check_naninf(g_var) + if err_msg is not None: raise ValueError( - f"CUDA error(1002). Tensor contains inf or nan values at rank {paddle.distributed.get_rank()} before gradient communication" + f"{err_msg}. Tensor contains inf or nan values at rank {paddle.distributed.get_rank()} before gradient communication" ) paddle.distributed.reduce( @@ -839,7 +843,7 @@ def _check_padding_zero(self): if pad_tensor is not None: assert paddle.all( pad_tensor == 0 - ).item(), f"CUDA error(1003). The padding of Tensor {k} is not zero" + ).item(), f"{SHARDING_PAD_NON_ZERO_ERROR}. 
The padding of Tensor {k} is not zero" if self._enable_timer: self.timers("check-padding-zero").stop() diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 5f8f512d384bb..6e48f7c769b2f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -135,7 +135,7 @@ def _load_micro_batch_impl(self, inputs, micro_step): assert ( len(inputs) == self._acc_steps ), "length of data should be %d, but it is %d" % ( - self.accumulate_steps, + self._acc_steps, len(inputs), ) return inputs[micro_step].detach() diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 75f11f03bd79c..5a5747096347f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -24,6 +24,7 @@ _get_global_group, _warn_cur_rank_not_in_group, ) +from paddle.framework.recall_error import check_naninf from paddle.utils import strtobool from ...utils import timer_helper as timer @@ -294,9 +295,10 @@ def batch_send_recv_on_calc_stream(p2p_op_list): if need_check: for p2p_op in p2p_op_list: if p2p_op.op == _send_on_calc_stream: - if not paddle.isfinite(p2p_op.tensor).all().item(): + err_msg = check_naninf(p2p_op.tensor) + if err_msg is not None: raise ValueError( - f"CUDA error(1002). Tensor contains inf or nan values at rank {paddle.distributed.get_rank()}" + f"{err_msg}. Tensor contains inf or nan values at rank {paddle.distributed.get_rank()}" ) group = _get_global_group() if group is None else group @@ -475,9 +477,10 @@ def _p2p_ops_tuple_or_tensor(tensors, p2p_func, pp_rank, pp_group): if need_check: if p2p_func == paddle.distributed.isend: for t in tensors: - if not paddle.isfinite(t).all().item(): + err_msg = check_naninf(t) + if err_msg is not None: raise ValueError( - f"CUDA error(1002). Tensor contains inf or nan values at rank {paddle.distributed.get_rank()}" + f"{err_msg}. Tensor contains inf or nan values at rank {paddle.distributed.get_rank()}" ) reqs = [] diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 80c69dd87a41a..1c9bde0e7c6b1 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -26,6 +26,7 @@ base as imperative_base, core, ) +from paddle.framework.recall_error import check_naninf from .log_util import logger @@ -663,10 +664,10 @@ def _comm_grads(self): need_check = strtobool(os.getenv('FLAGS_pp_check_naninf', '0')) if need_check: - naninf = paddle.isfinite(self.grad_storage).all() - if not naninf.item(): + err_msg = check_naninf(self.grad_storage) + if err_msg is not None: raise ValueError( - f"CUDA error(1002). Tensor contains inf or nan values at rank {paddle.distributed.get_rank()} before gradient communication" + f"{err_msg}. 
Tensor contains inf or nan values at rank {paddle.distributed.get_rank()} before gradient communication" ) if self._act == HOOK_ACTION.ALL_REDUCE: diff --git a/python/paddle/framework/recall_error.py b/python/paddle/framework/recall_error.py new file mode 100644 index 0000000000000..f672b28792cbb --- /dev/null +++ b/python/paddle/framework/recall_error.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import paddle + +AADIFF_ERROR = "PaddleRecall error(101): AAdiff" +LOSS_NAN_ERROR = "PaddleRecall error(102): LossNan" +SHARDING_PAD_NON_ZERO_ERROR = "PaddleRecall error(103): ShardingPadNonZero" +LOSS_INF_ERROR = "PaddleRecall error(104): LossInf" + + +def check_naninf(tensor): + if paddle.isfinite(tensor).all().item(): + return None + elif paddle.isnan(tensor).any().item(): + return LOSS_NAN_ERROR + else: + return LOSS_INF_ERROR From d41062125fd253d4f73561d114ec6346fec94a8c Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:51:16 +0800 Subject: [PATCH 080/135] [CINN] fix warp reduce (#68605) --- paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index ec8f063bc0a73..a6c0d9e84db65 100644 --- a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -463,7 +463,8 @@ __device__ inline bool cinn_any(const bool left, const bool right) { return left #define CINN_WARP_SHUFFLE_INTERNAL_IMPL(REDUCE_TYPE, INITIAL_VALUE, DTYPE) \ __device__ inline DTYPE cinn_warp_shuffle_##REDUCE_TYPE##_internal(const DTYPE value) { \ DTYPE tmp_val = value, shfl_res; \ - unsigned int mask = __activemask(); \ + unsigned int tmp_mask = ((unsigned long)1 << (blockDim.x < 32 ? 
blockDim.x : 32)) - 1;\
+    unsigned int mask = __activemask() & tmp_mask;                          \
     unsigned int lane = __popc(mask);                                       \
     if (lane < 32) {                                                        \
       CINN_SHUFFLE_FUNCTION(16, cinn_##REDUCE_TYPE, (DTYPE)(INITIAL_VALUE)) \

From 418fcf1d970646145d5ed25514da79837f9936b7 Mon Sep 17 00:00:00 2001
From: Shuhao Liang <50269654+lshpku@users.noreply.github.com>
Date: Fri, 11 Oct 2024 15:53:39 +0800
Subject: [PATCH 081/135] [CINN] Add the ReplaceCrossBlockReduction backend
 pass (#68364)

---
 paddle/cinn/hlir/pe/reduction.cc              |  25 ++
 paddle/cinn/hlir/pe/reduction.h               |   3 +
 paddle/cinn/ir/lowered_func.h                 |  26 ++
 paddle/cinn/optim/CMakeLists.txt              |   1 +
 paddle/cinn/optim/optimize.cc                 |   4 +
 .../optim/replace_cross_block_reduction.cc    | 310 ++++++++++++++++++
 .../optim/replace_cross_block_reduction.h     |  58 ++++
 .../runtime/cuda/cinn_cuda_runtime_source.cuh |  46 +++
 .../runtime/cuda/cuda_intrinsics_reduce.cc    |  22 ++
 test/cpp/pir/cinn/CMakeLists.txt              |   6 +-
 .../replace_cross_block_reduction_test.cc     | 202 ++++++++++++
 11 files changed, 702 insertions(+), 1 deletion(-)
 create mode 100644 paddle/cinn/optim/replace_cross_block_reduction.cc
 create mode 100644 paddle/cinn/optim/replace_cross_block_reduction.h
 create mode 100644 test/cpp/pir/cinn/replace_cross_block_reduction_test.cc

diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc
index ea351907c11e7..eb9df516ac513 100644
--- a/paddle/cinn/hlir/pe/reduction.cc
+++ b/paddle/cinn/hlir/pe/reduction.cc
@@ -1205,6 +1205,31 @@ std::string DiscreteReduceExternalFuncName(const ir::Expr& op,
   return "";
 }
 
+std::string GridReduceExternalFuncName(const ir::Expr& op,
+                                       const cinn::common::Type type) {
+  if (op.As<ir::Add>()) {
+    if (type.is_bool()) {
+      return "cinn_grid_reduce_any";
+    }
+    return "cinn_grid_reduce_sum" + Type2StrForReduce(type);
+  } else if (op.As<ir::Mul>()) {
+    if (type.is_bool()) {
+      return "cinn_grid_reduce_all";
+    }
+    return "cinn_grid_reduce_prod" + Type2StrForReduce(type);
+  } else if (op.As<ir::Max>()) {
+    return "cinn_grid_reduce_max" + Type2StrForReduce(type);
+  } else if (op.As<ir::Min>()) {
+    return "cinn_grid_reduce_min" + Type2StrForReduce(type);
+  } else if (op.As<ir::And>()) {
+    return "cinn_grid_reduce_all";
+  } else if (op.As<ir::Or>()) {
+    return "cinn_grid_reduce_any";
+  }
+  PADDLE_THROW(::common::errors::InvalidArgument(
+      "No matching grid reduce template for op: %s, type: %s", op, type));
+}
+
 }  // namespace pe
 }  // namespace hlir
 }  // namespace cinn
diff --git a/paddle/cinn/hlir/pe/reduction.h b/paddle/cinn/hlir/pe/reduction.h
index 2037e98e11622..32745a5035df2 100644
--- a/paddle/cinn/hlir/pe/reduction.h
+++ b/paddle/cinn/hlir/pe/reduction.h
@@ -474,6 +474,9 @@ std::string CrossThreadReduceExternalFuncName(const ir::Expr& op,
 std::string DiscreteReduceExternalFuncName(const ir::Expr& op,
                                            const ir::Expr& tensor);
 
+std::string GridReduceExternalFuncName(const ir::Expr& op,
+                                       const cinn::common::Type type);
+
 std::string Type2StrForReduce(cinn::common::Type type);
 }  // namespace pe
 }  // namespace hlir
diff --git a/paddle/cinn/ir/lowered_func.h b/paddle/cinn/ir/lowered_func.h
index ba94c8410cf08..224304bbdb23e 100644
--- a/paddle/cinn/ir/lowered_func.h
+++ b/paddle/cinn/ir/lowered_func.h
@@ -113,6 +113,28 @@ struct CudaAxisInfo {
 
 std::ostream& operator<<(std::ostream& os, const CudaAxisInfo& x);
 
+/**
+ * A struct representing a temporary global buffer (allocated on the heap) that
+ * is used as staging space during kernel execution.
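+ * For instance, the grid-reduce pass below records one staging buffer for
+ * the partial results and one zero-initialized semaphore buffer; each entry
+ * keeps the buffer's byte size and the index (`arg_idx`) at which it is
+ * appended to the function's argument list.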
+ */ +struct TempSpaceInfo { + TempSpaceInfo() = default; + TempSpaceInfo(const Expr& size, int arg_idx, bool need_zero_init = false) + : size_(size), arg_idx_(arg_idx), need_zero_init_(need_zero_init) {} + + Expr size() const { return size_; } + int arg_idx() const { return arg_idx_; } + bool need_zero_init() const { return need_zero_init_; } + + private: + // size of the space in bytes + Expr size_; + // index in the function's argument list + int arg_idx_; + // whether this space need to be zero-initialized + bool need_zero_init_; +}; + /** * Definition of a lowered function. Note that, it should be functional. * @@ -131,6 +153,10 @@ struct _LoweredFunc_ : ExprNode<_LoweredFunc_> { //! function's argument list, but will be used in the body. std::vector temp_bufs; + //! Temporary global buffers. These buffers will appear in the function's + //! argument list. + std::vector temp_spaces; + //! Body of this function. Expr body; diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 899f6ef18badd..e5cee4f5a5cda 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -24,6 +24,7 @@ gather_srcs( cast_bool_to_int8.cc var_mod_simplify.cc remove_schedule_block.cc + replace_cross_block_reduction.cc replace_cross_thread_reduction.cc replace_mod_to_max.cc resize_buffer.cc diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index 98e2f70c8eb37..d757cd9d9114b 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -31,6 +31,7 @@ #include "paddle/cinn/optim/rearrange_load_instruction.h" #include "paddle/cinn/optim/remove_schedule_block.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" +#include "paddle/cinn/optim/replace_cross_block_reduction.h" #include "paddle/cinn/optim/replace_cross_thread_reduction.h" #include "paddle/cinn/optim/trans_buffer_with_dynamic_shape.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" @@ -59,6 +60,9 @@ Expr Optimize(Expr e, // Simplify already contains CastSimplify Simplify(&copied); ReplaceCrossThreadReduction(&copied); + VLOG(4) << "After Optimize ReplaceCrossThreadReduction:" << copied; + ReplaceCrossBlockReduction(&copied); + VLOG(4) << "After Optimize ReplaceCrossBlockReduction:" << copied; UnrollLoop(&copied); VLOG(4) << "After Optimize UnrollLoop:" << copied; diff --git a/paddle/cinn/optim/replace_cross_block_reduction.cc b/paddle/cinn/optim/replace_cross_block_reduction.cc new file mode 100644 index 0000000000000..8cbe3e1191780 --- /dev/null +++ b/paddle/cinn/optim/replace_cross_block_reduction.cc @@ -0,0 +1,310 @@ +// Copyright (c) 2024 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
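+
+// Informal overview: the mutator below looks for a reduce loop whose
+// inner-most axis is bound to a gpu block, replaces its update statement
+// with a cinn_grid_reduce_* extern call, and guards it (plus every later
+// schedule block) behind an `is_last_block_done` flag derived from a
+// global semaphore; see the worked example in the corresponding header.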
+ +#include "paddle/cinn/optim/replace_cross_block_reduction.h" +#include + +#include "paddle/cinn/adt/adt.h" +#include "paddle/cinn/common/common.h" +#include "paddle/cinn/hlir/pe/reduction.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_mutator.h" +#include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/lang/compute.h" + +namespace cinn { +namespace optim { +namespace { + +ir::Expr CalcBufferSizeInBytes(const ir::Buffer& buffer) { + const ir::Expr numel = buffer->SymbolicNumel(); + return common::AutoSimplify(numel * buffer->dtype.bytes()); +} + +std::unordered_set GetReduceVarNames( + const ir::ScheduleBlockRealize* block_realize) { + const ir::ScheduleBlock* schedule_block = + block_realize->schedule_block.As(); + const std::vector& iter_values = block_realize->iter_values; + const std::vector& iter_vars = schedule_block->iter_vars; + + std::unordered_set reduce_var_names; + for (int i = 0; i < iter_values.size(); ++i) { + if (!iter_vars[i]->is_reduce_axis) { + continue; + } + ir::ir_utils::CollectIRNodesWithoutTensor( + iter_values[i], [&](const ir::Expr* x) { + if (x->as_var()) { + reduce_var_names.insert(x->as_var()->name); + } + return false; + }); + } + return reduce_var_names; +} + +ir::Expr GetRightOperand(const ir::Expr& expr) { +#define GET_RIGHT_OPERAND(OpT) \ + if (expr.As()) { \ + return expr.As()->b(); \ + } + + GET_RIGHT_OPERAND(ir::Add); + GET_RIGHT_OPERAND(ir::Mul); + GET_RIGHT_OPERAND(ir::Max); + GET_RIGHT_OPERAND(ir::Min); + GET_RIGHT_OPERAND(ir::And); + GET_RIGHT_OPERAND(ir::Or); + +#undef GET_RIGHT_OPERAND + PADDLE_THROW( + ::common::errors::InvalidArgument("Not a supported reduce op: %s", expr)); +} + +struct CrossBlockReductionReplacer : public ir::IRMutator<> { + void operator()(ir::Expr* expr) { Visit(expr); } + + private: + bool IsGridReduce(const ir::ScheduleBlockRealize* block_realize) { + if (cur_loops_.empty()) { + return false; + } + auto* innermost_loop = cur_loops_.back(); + if (!innermost_loop->is_gpu_block_binded()) { + return false; + } + const std::unordered_set reduce_var_names = + GetReduceVarNames(block_realize); + return reduce_var_names.count(innermost_loop->loop_var->name) > 0; + } + + void ConvertHeapBuffersToFuncArgs(ir::_LoweredFunc_* func_node) { + std::vector global_bufs; + std::vector local_bufs; + + for (auto& buf : func_node->temp_bufs) { + if (buf->memory_type == ir::MemoryType::Heap) { + global_bufs.push_back(buf); + } else { + local_bufs.push_back(buf); + } + } + + PADDLE_ENFORCE_LE(global_bufs.size(), + 1UL, + ::common::errors::PreconditionNotMet( + "Currently supports at most one global buffer.")); + + for (auto& buf : global_bufs) { + func_node->temp_spaces.emplace_back( + CalcBufferSizeInBytes(buf), /* arg_idx= */ func_node->args.size()); + func_node->args.emplace_back(buf, ir::Argument::IO::kOutput); + } + func_node->temp_bufs = local_bufs; + } + + ir::Tensor CreateLastBlockDoneTensor() { + if (is_done_tensor_.defined()) { + return is_done_tensor_; + } + const std::string name = "is_last_block_done"; + const std::vector shape = {}; + is_done_tensor_ = ir::_Tensor_::Make(name, common::Bool(), shape, shape); + is_done_tensor_->WithBuffer("local", "_" + name + "_temp_buffer"); + return is_done_tensor_; + } + + ir::Expr GetBlockBindedSpatialLoopExtend( + const ir::ScheduleBlockRealize* block_realize) { + const std::unordered_set reduce_var_names = + GetReduceVarNames(block_realize); + std::vector loop_extends; + for (auto* for_node : cur_loops_) { + if 
(reduce_var_names.count(for_node->loop_var->name) == 0 && + for_node->is_gpu_block_binded()) { + loop_extends.push_back(for_node->extent); + } + } + PADDLE_ENFORCE_EQ( + loop_extends.size(), + 1UL, + ::common::errors::PreconditionNotMet( + "There should be exactly one spatial loop binded on gpu block.")); + return loop_extends[0]; + } + + ir::Expr GetThreadBindedSpatialLoopExtend( + const ir::ScheduleBlockRealize* block_realize) { + const std::unordered_set reduce_var_names = + GetReduceVarNames(block_realize); + std::vector loop_extends; + for (auto* for_node : cur_loops_) { + if (reduce_var_names.count(for_node->loop_var->name) == 0 && + for_node->is_gpu_thread_binded()) { + loop_extends.push_back(for_node->extent); + } + } + PADDLE_ENFORCE_LE( + loop_extends.size(), + 1UL, + ::common::errors::PreconditionNotMet( + "There could be at most one spatial loop binded on gpu thread.")); + if (loop_extends.empty()) { + return ir::Expr(1); + } + return loop_extends[0]; + } + + ir::Expr CreateSemaphoreUpdateStmt( + const std::vector& semaphore_shape) { + const std::string name = "semaphore"; + ir::Tensor semaphore = ir::_Tensor_::Make( + name, common::Int(32), semaphore_shape, semaphore_shape); + semaphore->WithBuffer("global", "_" + name); + semaphore_buffer_ = semaphore->buffer; + ir::Expr update_semaphore = + lang::CallExtern("cinn_grid_reduce_update_semaphore", {semaphore}); + ir::Tensor is_done = CreateLastBlockDoneTensor(); + return ir::Store::Make(is_done, update_semaphore, /* indices= */ {}); + } + + ir::Expr WrapInLastBlockDone(ir::Expr* op) { + ir::Tensor is_done = CreateLastBlockDoneTensor(); + ir::Expr load_is_done = ir::Load::Make(is_done, /* indices= */ {}); + return ir::IfThenElse::Make(load_is_done, *op); + } + + void ReplaceByGridReduceExternCall(const ir::ScheduleBlock* schedule_block, + const ir::Expr num_spatial_threads) { + ir::Expr update_stmt = schedule_block->body; + if (update_stmt.As()) { + PADDLE_ENFORCE_EQ( + update_stmt.As()->stmts.size(), + 1UL, + ::common::errors::InvalidArgument( + "There should be exactly one statment inside schedule_block.")); + update_stmt = update_stmt.As()->stmts[0]; + } + PADDLE_ENFORCE_NOT_NULL( + update_stmt.As(), + ::common::errors::InvalidArgument( + "The top-level statement in schedule_block must be a store.")); + + auto* store_node = update_stmt.As(); + ir::Expr rvalue = GetRightOperand(store_node->value); + PADDLE_ENFORCE_NOT_NULL(rvalue.As(), + ::common::errors::InvalidArgument( + "The rvalue of reduce is not a load.")); + + std::string func_name = hlir::pe::GridReduceExternalFuncName( + store_node->value, store_node->tensor->type()); + ir::Tensor rf_tensor = rvalue.As()->tensor.as_tensor_ref(); + store_node->value = + lang::CallExtern(func_name, {rf_tensor, num_spatial_threads}); + } + + void Visit(const ir::_LoweredFunc_* expr, ir::Expr* op) override { + is_after_grid_reduce_ = false; + func_arg_buffer_names_.clear(); + for (auto& arg : expr->args) { + if (arg.is_buffer()) { + func_arg_buffer_names_.insert(arg.buffer_arg()->name); + } + } + + IRMutator::Visit(expr, op); + if (!is_after_grid_reduce_) { + return; + } + + ir::_LoweredFunc_* func_node = op->As(); + ConvertHeapBuffersToFuncArgs(func_node); + + func_node->temp_bufs.push_back(is_done_tensor_->buffer); + func_node->temp_spaces.emplace_back( + CalcBufferSizeInBytes(semaphore_buffer_), + /* arg_idx= */ func_node->args.size(), + /* need_zero_init = */ true); + func_node->args.emplace_back(semaphore_buffer_, ir::Argument::IO::kOutput); + } + + void Visit(const 
ir::ScheduleBlockRealize* expr, ir::Expr* op) override { + const ir::ScheduleBlock* schedule_block = + expr->schedule_block.As(); + + if (schedule_block->name.substr(0, 4) == "root") { + IRMutator::Visit(expr, op); + return; + } + + if (!IsGridReduce(expr)) { + if (is_after_grid_reduce_) { + *op = WrapInLastBlockDone(op); + } + return; + } + + PADDLE_ENFORCE_EQ( + is_after_grid_reduce_, + false, + ::common::errors::PreconditionNotMet( + "Currently supports only one reduce in a fusion group.")); + is_after_grid_reduce_ = true; + + ir::Expr num_spatial_threads = GetThreadBindedSpatialLoopExtend(expr); + ReplaceByGridReduceExternCall(schedule_block, num_spatial_threads); + + ir::Expr num_spatial_blocks = GetBlockBindedSpatialLoopExtend(expr); + ir::Expr semaphore_update = CreateSemaphoreUpdateStmt({num_spatial_blocks}); + cur_parent_block_stmts_.push_back(semaphore_update); + *op = WrapInLastBlockDone(op); + } + + void Visit(const ir::For* expr, ir::Expr* op) override { + cur_loops_.push_back(expr); + IRMutator::Visit(expr, op); + cur_loops_.pop_back(); + } + + void Visit(const ir::Block* block, ir::Expr* op) override { + // We override the Block visitor to facilitate statement insertion. + std::vector old_parent_block_stmts; + old_parent_block_stmts.swap(cur_parent_block_stmts_); + auto* node = op->As(); + for (auto& stmt : node->stmts) { + IRMutator::Visit(&stmt, &stmt); + cur_parent_block_stmts_.push_back(stmt); + } + node->stmts = std::move(cur_parent_block_stmts_); + cur_parent_block_stmts_ = std::move(old_parent_block_stmts); + } + + void Visit(ir::Expr* expr) { IRMutator::Visit(expr, expr); } + + private: + std::vector cur_loops_; + std::vector cur_parent_block_stmts_; + std::unordered_set func_arg_buffer_names_; + ir::Tensor is_done_tensor_; + ir::Buffer semaphore_buffer_; + bool is_after_grid_reduce_{false}; +}; + +} // namespace + +void ReplaceCrossBlockReduction(Expr* e) { CrossBlockReductionReplacer()(e); } + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/replace_cross_block_reduction.h b/paddle/cinn/optim/replace_cross_block_reduction.h new file mode 100644 index 0000000000000..a78b67368fbcd --- /dev/null +++ b/paddle/cinn/optim/replace_cross_block_reduction.h @@ -0,0 +1,58 @@ +// Copyright (c) 2024 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/cinn/common/common.h" +#include "paddle/cinn/ir/ir.h" + +namespace cinn { +namespace optim { + +/** + * This pass handles the cross-block reduction properly. + * + * Specific transformations: + * 1. Replaces the cross-block reduction with an external call to the + * `grid_reduce` template function. + * 2. Adds a condition check `is_last_block_done` to the reduction operation + * and all subsequent schedule blocks. + * 3. Pushes global buffers (`rf` and `semaphore`) to the function’s argument + * list. 
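 + *    (The semaphore space is registered with need_zero_init = true,
 + *    because cinn_grid_reduce_update_semaphore counts arriving blocks
 + *    upward from zero.)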
+ * + * Example: + * + * function reduce_sum (..., var_1) + * { + * thread_bind[blockIdx.x] for (i, 0, 16): + * thread_bind[blockIdx.y] for (j, 0, 8): // reduce axis + * var_1[i] += var_1_rf[j, i] + * } + * + * After pass: + * + * function reduce_sum (..., var_1, var_1_rf, semaphore) + * { + * thread_bind[blockIdx.x] for (i, 0, 16): + * thread_bind[blockIdx.y] for (j, 0, 8): // reduce axis + * is_last_block_done = update_semaphore(semaphore) + * if (is_last_block_done): + * var_1[i] = grid_reduce_sum(var_1_rf) + * } + */ +void ReplaceCrossBlockReduction(Expr* e); + +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index a6c0d9e84db65..18a7e1af4e9b4 100644 --- a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -720,6 +720,52 @@ EXPAND_REDUCE_FP16_MACRO(CINN_BLOCK_REDUCE_IMPL) #undef CINN_BLOCK_REDUCE_IMPL +#define CINN_GRID_REDUCE_IMPL(REDUCE_TYPE, init_value, DTYPE) \ + __shared__ DTYPE tmp_val; \ + int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; \ + if (tid < spatial_threads) { \ + tmp_val = init_value; \ + for (int y = 0; y < gridDim.y; y++) { \ + tmp_val = cinn_##REDUCE_TYPE(tmp_val, mem[(y * gridDim.x + blockIdx.x) * spatial_threads + tid]); \ + } \ + } \ + __syncthreads(); \ + return tmp_val; + +#define CINN_GRID_REDUCE_MACRO(REDUCE_TYPE, INITIAL_VALUE, DTYPE) \ + __device__ inline DTYPE cinn_grid_reduce_##REDUCE_TYPE(const DTYPE* mem, int spatial_threads) { \ + CINN_GRID_REDUCE_IMPL(REDUCE_TYPE, (DTYPE)(INITIAL_VALUE), DTYPE); \ + } + +EXPAND_REDUCE_INT32_MARCO(CINN_GRID_REDUCE_MACRO) +EXPAND_REDUCE_INT64_MARCO(CINN_GRID_REDUCE_MACRO) +EXPAND_REDUCE_FP32_MACRO(CINN_GRID_REDUCE_MACRO) +EXPAND_REDUCE_FP64_MACRO(CINN_GRID_REDUCE_MACRO) +EXPAND_REDUCE_BOOL_MACRO(CINN_GRID_REDUCE_MACRO) + +#ifdef CINN_CUDA_BF16 +EXPAND_REDUCE_BF16_MACRO(CINN_GRID_REDUCE_MACRO) +#endif + +#ifdef CINN_CUDA_FP16 +EXPAND_REDUCE_FP16_MACRO(CINN_GRID_REDUCE_MACRO) +#endif + +#undef CINN_GRID_REDUCE_IMPL +#undef CINN_GRID_REDUCE_MACRO + +__device__ inline bool cinn_grid_reduce_update_semaphore(int *semaphores) { + __shared__ bool done; + __threadfence(); + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { + int old = atomicAdd(&semaphores[blockIdx.x], 1); + done = (old == (gridDim.y - 1)); + } + __syncthreads(); + return done; +} + #undef EXPAND_REDUCE_INT32_MARCO #undef EXPAND_REDUCE_INT64_MARCO #undef EXPAND_REDUCE_FP32_MACRO diff --git a/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc b/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc index da52f3ed481d6..5e7514640f8a7 100644 --- a/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc +++ b/paddle/cinn/runtime/cuda/cuda_intrinsics_reduce.cc @@ -165,6 +165,28 @@ CINN_REGISTER_HELPER(cuda_intrinsics_reduce) { #undef REGISTER_DISCRETE_REDUCE_INTERNAL_FUNC_IMPL +#define REGISTER_GRID_REDUCE_FUNC_IMPL(REDUCE_TYPE, DTYPE) \ + REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_grid_reduce_##REDUCE_TYPE, target) \ + .SetRetType() \ + .AddInputType() \ + .AddInputType() \ + .End(); + + EXPAND_REDUCE_INT32_REGISTER_MARCO(REGISTER_GRID_REDUCE_FUNC_IMPL) + EXPAND_REDUCE_INT64_REGISTER_MARCO(REGISTER_GRID_REDUCE_FUNC_IMPL) + EXPAND_REDUCE_BF16_REGISTER_MACRO(REGISTER_GRID_REDUCE_FUNC_IMPL) + EXPAND_REDUCE_FP16_REGISTER_MACRO(REGISTER_GRID_REDUCE_FUNC_IMPL) + EXPAND_REDUCE_FP32_REGISTER_MACRO(REGISTER_GRID_REDUCE_FUNC_IMPL) + 
EXPAND_REDUCE_FP64_REGISTER_MACRO(REGISTER_GRID_REDUCE_FUNC_IMPL)
+  EXPAND_REDUCE_BOOL_REGISTER_MACRO(REGISTER_GRID_REDUCE_FUNC_IMPL)
+
+#undef REGISTER_GRID_REDUCE_FUNC_IMPL
+
+  REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_grid_reduce_update_semaphore, target)
+      .SetRetType<bool>()
+      .AddInputType<int*>()
+      .End();
+
 #define REGISTER_BLOCK_REDUCE_FUNC_IMPL(REDUCE_TYPE, DTYPE)                   \
   REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_block_reduce_##REDUCE_TYPE, target) \
       .SetRetType<DTYPE>()                                                    \
diff --git a/test/cpp/pir/cinn/CMakeLists.txt b/test/cpp/pir/cinn/CMakeLists.txt
index 802de8c5b7aa5..6a3f0b00ecea5 100644
--- a/test/cpp/pir/cinn/CMakeLists.txt
+++ b/test/cpp/pir/cinn/CMakeLists.txt
@@ -39,6 +39,9 @@ if(WITH_TESTING AND WITH_CINN)
 
   paddle_test(test_file_tile_config SRCS file_tile_config_test.cc)
 
+  paddle_test(replace_cross_block_reduction_test SRCS
+              replace_cross_block_reduction_test.cc)
+
   # DO NOT forget add test name here, otherwise it will not be executed in
   # CINN CI.
   set(cinn_unit_tests
@@ -52,7 +55,8 @@ if(WITH_TESTING AND WITH_CINN)
       merge_parallel_matmul_pass_test
       test_tile_config_searcher
       test_tile_config_searcher_pure_spatial
-      test_file_tile_config)
+      test_file_tile_config
+      replace_cross_block_reduction_test)
 
   foreach(test_name ${cinn_unit_tests})
     get_property(
diff --git a/test/cpp/pir/cinn/replace_cross_block_reduction_test.cc b/test/cpp/pir/cinn/replace_cross_block_reduction_test.cc
new file mode 100644
index 0000000000000..c05b8077e8b9e
--- /dev/null
+++ b/test/cpp/pir/cinn/replace_cross_block_reduction_test.cc
@@ -0,0 +1,202 @@
+// Copyright (c) 2024 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
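+
+// The two tests below lower a small reduce + elementwise program, bind the
+// reduce axis to blockIdx.y, apply the pass, and then check both the printed
+// IR and the recorded temp_spaces (byte size, arg_idx, need_zero_init).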
+ +#include "paddle/cinn/optim/replace_cross_block_reduction.h" + +#include + +#include "paddle/cinn/cinn.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/utils/string.h" + +namespace cinn { +namespace optim { + +TEST(CrossBlockReductionReplacer, SRLayout) { + Context::Global().ResetNameId(); + + Placeholder A("A", {Expr(8), Expr(16)}); + Var reduce_k(8, "reduce_k"); + ir::Tensor B = Compute( + {Expr(16)}, + [&](Var i) { return lang::ReduceSum(A(reduce_k, i), {reduce_k}); }, + "B"); + ir::Tensor C = Compute( + {Expr(16)}, [&](Var i) { return lang::Sqrt(B(i)); }, "C"); + + ast_gen_ius::TensorGroup tensor_group({A, B, C}); + auto func = lang::LowerToAst("reduce_sum_sqrt", {C}, &tensor_group); + + ir::ModuleExpr mod_expr({func->body}); + ir::IRSchedule ir_sch(mod_expr); + + ir_sch.Bind(ir_sch.GetLoops("B")[0], "blockIdx.x"); + ir_sch.Bind(ir_sch.GetLoops("B")[1], "blockIdx.y"); + ir_sch.Bind(ir_sch.GetLoops("C")[0], "blockIdx.x"); + + func->body = ir_sch.GetModule().GetExprs()[0]; + A->WithBuffer("global", "_A"); + B->WithBuffer("local", "_B_temp_buffer"); + func->temp_bufs = {A->buffer, B->buffer}; + + VLOG(6) << "Before ReplaceCrossBlockReduction: " << func; + auto expr_func = Expr(func); + ReplaceCrossBlockReduction(&expr_func); + VLOG(6) << "After ReplaceCrossBlockReduction: " << func; + + EXPECT_EQ(utils::GetStreamCnt(func), + utils::Trim(R"ROC(function reduce_sum_sqrt (_C, _A, _semaphore) +{ + ScheduleBlock(root) + { + { + thread_bind[blockIdx.x] for (i, 0, 16) + { + ScheduleBlock(B__reduce_init) + { + i0 = axis.bind(i) + B__reduce_init[i0] = 0.00000000f + } + thread_bind[blockIdx.y] for (reduce_k, 0, 8) + { + is_last_block_done[0] = cinn_grid_reduce_update_semaphore(Tensor(semaphore, [16])) + if (is_last_block_done[0]) { + ScheduleBlock(B) + { + i0_0, i1 = axis.bind(i, reduce_k) + B[i0_0] = cinn_grid_reduce_sum_fp32(Tensor(A, [8,16]), 1) + } + } + } + } + thread_bind[blockIdx.x] for (i, 0, 16) + { + if (is_last_block_done[0]) { + ScheduleBlock(C) + { + i0_1 = axis.bind(i) + C[i0_1] = sqrt(B[i0_1]) + } + } + } + } + } +} +)ROC")); + EXPECT_EQ(func->temp_spaces.size(), 2); + EXPECT_EQ(func->temp_spaces[0].size().as_int64(), 512); + EXPECT_EQ(func->temp_spaces[0].arg_idx(), 1); + EXPECT_EQ(func->temp_spaces[0].need_zero_init(), false); + EXPECT_EQ(func->temp_spaces[1].size().as_int64(), 64); + EXPECT_EQ(func->temp_spaces[1].arg_idx(), 2); + EXPECT_EQ(func->temp_spaces[1].need_zero_init(), true); +} + +TEST(CrossBlockReductionReplacer, RSLayout) { + Context::Global().ResetNameId(); + + Placeholder A("A", {Expr(8), Expr(4), Expr(32)}); + Var reduce_k(8, "reduce_k"); + ir::Tensor B = Compute( + {Expr(4), Expr(32)}, + [&](Var i, Var j) { + return lang::ReduceMax(A(reduce_k, i, j), {reduce_k}); + }, + "B"); + ir::Tensor C = Compute( + {Expr(4), Expr(32)}, + [&](Var i, Var j) { return lang::Exp(B(i, j)); }, + "C"); + + ast_gen_ius::TensorGroup tensor_group({A, B, C}); + auto func = lang::LowerToAst("reduce_max_exp", {C}, &tensor_group); + + ir::ModuleExpr mod_expr({func->body}); + ir::IRSchedule ir_sch(mod_expr); + + ir_sch.Bind(ir_sch.GetLoops("B")[0], "blockIdx.x"); + ir_sch.Bind(ir_sch.GetLoops("B")[1], "threadIdx.x"); + ir_sch.Bind(ir_sch.GetLoops("B")[2], "blockIdx.y"); + ir_sch.Bind(ir_sch.GetLoops("C")[0], "blockIdx.x"); + ir_sch.Bind(ir_sch.GetLoops("C")[1], "threadIdx.x"); + + func->body = ir_sch.GetModule().GetExprs()[0]; + 
A->WithBuffer("global", "_A"); + B->WithBuffer("local", "_B_temp_buffer"); + func->temp_bufs = {A->buffer, B->buffer}; + + VLOG(6) << "Before ReplaceCrossBlockReduction: " << func; + auto expr_func = Expr(func); + ReplaceCrossBlockReduction(&expr_func); + VLOG(6) << "After ReplaceCrossBlockReduction: " << func; + + EXPECT_EQ(utils::GetStreamCnt(func), + utils::Trim(R"ROC(function reduce_max_exp (_C, _A, _semaphore) +{ + ScheduleBlock(root) + { + { + thread_bind[blockIdx.x] for (i, 0, 4) + { + thread_bind[threadIdx.x] for (j, 0, 32) + { + ScheduleBlock(B__reduce_init) + { + i0, i1 = axis.bind(i, j) + B__reduce_init[i0, i1] = -3.40282346e+38f + } + thread_bind[blockIdx.y] for (reduce_k, 0, 8) + { + is_last_block_done[0] = cinn_grid_reduce_update_semaphore(Tensor(semaphore, [4])) + if (is_last_block_done[0]) { + ScheduleBlock(B) + { + i0_0, i1_0, i2 = axis.bind(i, j, reduce_k) + B[i0_0, i1_0] = cinn_grid_reduce_max_fp32(Tensor(A, [8,4,32]), 32) + } + } + } + } + } + thread_bind[blockIdx.x] for (i, 0, 4) + { + thread_bind[threadIdx.x] for (j, 0, 32) + { + if (is_last_block_done[0]) { + ScheduleBlock(C) + { + i0_1, i1_1 = axis.bind(i, j) + C[i0_1, i1_1] = exp(B[i0_1, i1_1]) + } + } + } + } + } + } +} +)ROC")); + EXPECT_EQ(func->temp_spaces.size(), 2); + EXPECT_EQ(func->temp_spaces[0].size().as_int64(), 4096); + EXPECT_EQ(func->temp_spaces[0].arg_idx(), 1); + EXPECT_EQ(func->temp_spaces[0].need_zero_init(), false); + EXPECT_EQ(func->temp_spaces[1].size().as_int64(), 16); + EXPECT_EQ(func->temp_spaces[1].arg_idx(), 2); + EXPECT_EQ(func->temp_spaces[1].need_zero_init(), true); +} + +} // namespace optim +} // namespace cinn From f093ac2adc03be96cceef7d3750c25148248d060 Mon Sep 17 00:00:00 2001 From: Zhan Rongrui <46243324+zrr1999@users.noreply.github.com> Date: Fri, 11 Oct 2024 22:45:13 +0800 Subject: [PATCH 082/135] [SOT][dynamic shape] optimize fallback mechanism (#68616) --- .../opcode_translator/executor/function_graph.py | 13 ++++++++++--- test/sot/test_sot_dynamic_shape.py | 13 ++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py index 5554b6e1a5f11..6a0067f5bfdab 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py +++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py @@ -636,10 +636,13 @@ def try_infer_meta_fn(args, kwargs) -> Any: metas = convert_to_meta(args) kwmetas = convert_to_meta(kwargs) return args, kwargs, infer_meta_fn(func, *metas, **kwmetas) - except NotSupportedTensorArgumentError as e: + except (NotSupportedTensorArgumentError, TypeError) as e: bound_arguments = inspect.signature(func).bind(*args, **kwargs) bound_arguments.apply_defaults() - if e.name in bound_arguments.arguments: + if ( + isinstance(e, NotSupportedTensorArgumentError) + and e.name in bound_arguments.arguments + ): original_var = bound_arguments.arguments[e.name] flatten_vars = original_var.flatten_items() if not any( @@ -659,7 +662,11 @@ def try_infer_meta_fn(args, kwargs) -> Any: ) else: flatten_vars = reduce( - lambda x, y: x + y.flatten_items(), + lambda x, y: ( + x + y.flatten_items() + if isinstance(y, VariableBase) + else x + ), bound_arguments.arguments.values(), [], ) diff --git a/test/sot/test_sot_dynamic_shape.py b/test/sot/test_sot_dynamic_shape.py index 4ef8484da6a8f..97b763490cb27 100644 --- a/test/sot/test_sot_dynamic_shape.py +++ b/test/sot/test_sot_dynamic_shape.py @@ -173,7 +173,7 @@ def 
test_dynamic_shape_in_list(self): ) self.assertEqual(ctx.translate_count, 2) - def test_dynamic_shape_fallback(self): + def test_conv_dynamic_shape_fallback(self): with with_allow_dynamic_shape_guard( True ), test_instruction_translator_cache_context() as ctx: @@ -182,6 +182,17 @@ def test_dynamic_shape_fallback(self): conv(paddle.randn([1, 3, 224, 224])) self.assertEqual(ctx.translate_count, i) + def test_pad_dynamic_shape_fallback(self): + with with_allow_dynamic_shape_guard( + True + ), test_instruction_translator_cache_context() as ctx: + pad_func = check_no_breakgraph( + lambda x, n: paddle.nn.functional.pad(x, [0, n, 0, 0]) + ) + for i in range(1, 5): + self.assert_results(pad_func, paddle.randn([1, 3, 224, 224]), i) + self.assertEqual(ctx.translate_count, i) + if __name__ == '__main__': unittest.main() From 092f378422eeb3ca377359b196e42d53f4b64d57 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Sat, 12 Oct 2024 10:11:08 +0800 Subject: [PATCH 083/135] [DRR] fix drr rewrite bug (#68642) --- paddle/fluid/pir/drr/src/rewrite_pattern.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 4bcac16172bc5..9dce8f3d02a42 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -545,7 +545,7 @@ MatchContextImpl DrrRewritePattern::CreateOperations( } // 3. insert new op at point max(max_res_idx+1, min_src_idx) - if (min_src_idx > max_res_idx) { + if (min_src_idx > max_res_idx || max_res_idx_op == nullptr) { rewriter.set_insertion_point(min_src_idx_op); } else { rewriter.SetInsertionPointAfter(max_res_idx_op); From 4daca526b7d7e781903748ede9d3daa32d4972d0 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Sat, 12 Oct 2024 10:50:59 +0800 Subject: [PATCH 084/135] modify api dropout dttostaic bug (#68637) --- python/paddle/autograd/backward_utils.py | 3 --- python/paddle/nn/functional/common.py | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index 643cf11abe6c3..ad470b614d689 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -318,9 +318,6 @@ def _check_vjp_dynamic_shape(op, inputs): for items in inputs: for item in items: if item.initialized() and -1 in item.shape: - warnings.warn( - f"[Prim] Decomp op does not support dynamic shape -1, but got shape {item.shape} in inputs of op {op.name()} . Prim will skip its vjp op." 
- ) return True diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 3cb723d1f3d6c..7c78be4a99dd0 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1274,7 +1274,9 @@ def get_attrs(prog, dropout_prob, is_test, seed): dtype = x.dtype keep_prob = 1 - p if training: - if in_dynamic_or_pir_mode() and p == 1.0: + if in_dynamic_mode() and p == 1.0: + return paddle.scale(x, scale=0.0) + elif in_pir_mode() and isinstance(p, (float, int)) and p == 1.0: return paddle.scale(x, scale=0.0) scale_input = ( From 15067264413456019498389134c794aed247c33d Mon Sep 17 00:00:00 2001 From: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com> Date: Sat, 12 Oct 2024 10:52:36 +0800 Subject: [PATCH 085/135] Fix FusedMultitransformer ops yaml (#68595) * fix FusedMultitransformer kernel declaration * add notes --- paddle/phi/infermeta/fusion.cc | 4 ++-- paddle/phi/infermeta/fusion.h | 4 ++-- .../kernels/fusion/gpu/fused_multi_transformer_kernel.cu | 8 ++++++-- paddle/phi/ops/yaml/ops.yaml | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index fa814545c0c13..ed2ea748e6782 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -134,7 +134,7 @@ void GroupNormalizeSiluXPUInferMeta(const MetaTensor& x, void FusedMultiTransformerInferMeta( const MetaTensor& x, const std::vector& ln_scales, - const std::vector& ln_biases, + const paddle::optional>& ln_biases, const std::vector& qkv_weights, const paddle::optional>& qkv_biases, const paddle::optional>& cache_kvs, @@ -147,7 +147,7 @@ void FusedMultiTransformerInferMeta( const std::vector& out_linear_weights, const paddle::optional>& out_linear_biases, const std::vector& ffn_ln_scales, - const std::vector& ffn_ln_biases, + const paddle::optional>& ffn_ln_biases, const std::vector& ffn1_weights, const paddle::optional>& ffn1_biases, const std::vector& ffn2_weights, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index 2cbdc389a24c6..1e68f77222064 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -25,7 +25,7 @@ namespace phi { void FusedMultiTransformerInferMeta( const MetaTensor& x, const std::vector& ln_scales, - const std::vector& ln_biases, + const paddle::optional>& ln_biases, const std::vector& qkv_weights, const paddle::optional>& qkv_biases, const paddle::optional>& cache_kvs, @@ -38,7 +38,7 @@ void FusedMultiTransformerInferMeta( const std::vector& out_linear_weights, const paddle::optional>& out_linear_biases, const std::vector& ffn_ln_scales, - const std::vector& ffn_ln_biases, + const paddle::optional>& ffn_ln_biases, const std::vector& ffn1_weights, const paddle::optional>& ffn1_biases, const std::vector& ffn2_weights, diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu index 870110f0900c6..7ef98654ade71 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_kernel.cu @@ -31,7 +31,7 @@ void FusedMultiTransformerOpKernel( const Context &dev_ctx, const DenseTensor &x, const std::vector &ln_scales, - const std::vector &ln_biases, + const paddle::optional> &ln_biases_in, const std::vector &qkv_weights, const paddle::optional> &qkv_biases_in, const paddle::optional> &cache_kvs_in, @@ -45,7 +45,7 @@ void 
FusedMultiTransformerOpKernel( const paddle::optional> &out_linear_biases_in, const std::vector &ffn_ln_scales, - const std::vector &ffn_ln_biases, + const paddle::optional> &ffn_ln_biases_in, const std::vector &ffn1_weights, const paddle::optional> &ffn1_biases_in, const std::vector &ffn2_weights, @@ -76,6 +76,10 @@ void FusedMultiTransformerOpKernel( int dim_embed = input_x_dims[2]; int bsz_seq = bsz * seq_len; + // Optional Bias input for LayerNorm / RMSNorm + auto ln_biases = ln_biases_in.get(); + auto ffn_ln_biases = ffn_ln_biases_in.get(); + bool use_glu = (act_method == "geglu" || act_method == "swiglu"); bool remove_padding = false; diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index f4e9b876ce538..3ec01a70bd55a 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -2165,7 +2165,7 @@ - op : fused_multi_transformer args : (Tensor x, Tensor[] ln_scales, Tensor[] ln_biases, Tensor[] qkv_weights, Tensor[] qkv_biases, Tensor[] cache_kvs, Tensor[] pre_caches, Tensor rotary_tensor, Tensor beam_offset, Tensor time_step, Tensor seq_lengths, Tensor src_mask, Tensor[] out_linear_weights, Tensor[] out_linear_biases, Tensor[] ffn_ln_scales, Tensor[] ffn_ln_biases, Tensor[] ffn1_weights, Tensor[] ffn1_biases, Tensor[] ffn2_weights, Tensor[] ffn2_biases, bool pre_layer_norm = true, float epsilon = 1e-5, float residual_alpha = 1.0f, float dropout_rate = .5f, int rotary_emb_dims = 0, bool is_test = false, str dropout_implementation = "downgrade_in_infer", str act_method = "gelu", bool trans_qkvw = true, int ring_id = -1, str norm_type = "layernorm", bool use_neox_rotary_style=true, int gqa_group_size=-1) - optional : qkv_biases, cache_kvs, pre_caches, rotary_tensor, beam_offset, time_step, seq_lengths, src_mask, out_linear_biases, ffn1_biases, ffn2_biases, cache_kv_outs + optional : ln_biases, qkv_biases, cache_kvs, pre_caches, rotary_tensor, beam_offset, time_step, seq_lengths, src_mask, out_linear_biases, ffn_ln_biases, ffn1_biases, ffn2_biases, cache_kv_outs output : Tensor[](cache_kv_outs){out_linear_weights.size()}, Tensor(out) infer_meta : func : FusedMultiTransformerInferMeta From 899ace552eba89abbbdbb4bf44c9c453e4a20843 Mon Sep 17 00:00:00 2001 From: joseflv <42016714+joseflv@users.noreply.github.com> Date: Sat, 12 Oct 2024 10:58:56 +0800 Subject: [PATCH 086/135] fix core: add spin lock when auto_growth_allocator release memory (#68631) --- .../phi/core/memory/allocation/auto_growth_best_fit_allocator.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.h index 7f25c1f963874..bcd7ee2810864 100644 --- a/paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/phi/core/memory/allocation/auto_growth_best_fit_allocator.h @@ -46,6 +46,7 @@ class AutoGrowthBestFitAllocator : public Allocator { // Release the memory block which is not used in pool. 
uint64_t ReleaseImpl(const phi::Place &place) override { + std::lock_guard guard(spinlock_); return FreeIdleChunks(); } From c0460428068101b0941f77f8e8b45581879e842c Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Sat, 12 Oct 2024 11:00:27 +0800 Subject: [PATCH 087/135] [PIR] Make get PIR flag from env logic more robust (#68639) * [PIR] Make get PIR flag from env logic more robust * [PIR] Make get PIR flag from env logic more robust * refine code --- python/paddle/base/framework.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 8708f506e0e99..14dbb58bbd4aa 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -212,16 +212,18 @@ def __init__(self): self._in_sot_simulation_mode_ = False self._functional_dygraph_context_manager = None self._dygraph_tracer_ = _dygraph_tracer_ - tmp_flags = os.environ.get("FLAGS_enable_pir_api") - if tmp_flags is not None: - if ( - tmp_flags == "0" - or tmp_flags == 0 - or tmp_flags == "False" - or not tmp_flags - ): - tmp_flags = False - set_flags({"FLAGS_enable_pir_api": bool(tmp_flags)}) + env_pir_enabled = os.environ.get("FLAGS_enable_pir_api") + + if env_pir_enabled is not None: + pir_enabled = env_pir_enabled.lower() not in [ + 'n', + 'no', + 'f', + 'false', + 'off', + '0', + ] + set_flags({"FLAGS_enable_pir_api": pir_enabled}) self._use_pir_api_ = get_flags("FLAGS_enable_pir_api")[ "FLAGS_enable_pir_api" ] From 653f60596162535973ed5da849c2eea69d67dd68 Mon Sep 17 00:00:00 2001 From: ZhenxingLi Date: Sat, 12 Oct 2024 11:18:35 +0800 Subject: [PATCH 088/135] =?UTF-8?q?=E3=80=90AutoParallel=E3=80=91Add=20c?= =?UTF-8?q?=5Fembedding=20pass=20in=20PIR=20(#68389)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pir/dialect/op_generator/ops_api_gen.py | 1 - .../phi/infermeta/spmd_rules/c_embedding.cc | 200 ++++++++++ paddle/phi/infermeta/spmd_rules/c_embedding.h | 33 ++ paddle/phi/infermeta/spmd_rules/rules.cc | 3 + paddle/phi/infermeta/spmd_rules/rules.h | 1 + .../yaml/inconsistent/static_backward.yaml | 1 + .../phi/ops/yaml/inconsistent/static_ops.yaml | 1 + .../distributed/auto_parallel/constants.py | 1 + .../auto_parallel/static/engine.py | 10 + python/paddle/distributed/passes/__init__.py | 3 + .../passes/auto_parallel_c_embedding.py | 345 ++++++++++++++++++ test/auto_parallel/pir/CMakeLists.txt | 6 + .../pir/auto_parallel_c_embedding_pass.py | 201 ++++++++++ .../test_auto_parallel_c_embedding_pass.py | 44 +++ .../spmd_rules/test_c_embedding_rule.py | 146 ++++++++ 15 files changed, 995 insertions(+), 1 deletion(-) create mode 100644 paddle/phi/infermeta/spmd_rules/c_embedding.cc create mode 100644 paddle/phi/infermeta/spmd_rules/c_embedding.h create mode 100644 python/paddle/distributed/passes/auto_parallel_c_embedding.py create mode 100644 test/auto_parallel/pir/auto_parallel_c_embedding_pass.py create mode 100644 test/auto_parallel/pir/test_auto_parallel_c_embedding_pass.py create mode 100644 test/auto_parallel/spmd_rules/test_c_embedding_rule.py diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index b5443b66351b2..3ee47850f4dcb 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -164,7 +164,6 @@ 'barrier', 'c_allreduce_min', 'c_allreduce_prod', - 'c_embedding', 'c_identity', 'c_reduce_sum', 
'c_softmax_with_cross_entropy', diff --git a/paddle/phi/infermeta/spmd_rules/c_embedding.cc b/paddle/phi/infermeta/spmd_rules/c_embedding.cc new file mode 100644 index 0000000000000..073a885a59a58 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/c_embedding.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/spmd_rules/c_embedding.h" +#include "paddle/phi/infermeta/spmd_rules/embedding.h" + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/matmul.h" +#include "paddle/phi/infermeta/spmd_rules/reshape.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" + +namespace phi::distributed { + +using phi::distributed::auto_parallel::str_join; + +SpmdInfo CEmbeddingInferSpmd(const DistMetaTensor& weight, + const DistMetaTensor& x, + int start_index, + int vocab_size) { + // Step0: Verify input args based on c_embedding logic + auto x_shape = common::vectorize(x.dims()); + auto weight_shape = common::vectorize(weight.dims()); + int x_ndim = static_cast(x_shape.size()); + int weight_ndim = static_cast(weight_shape.size()); + auto x_dist_attr_src = x.dist_attr(); + auto weight_dist_attr_src = weight.dist_attr(); + std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); + std::vector weight_dims_mapping = + weight_dist_attr_src.dims_mapping(); + PADDLE_ENFORCE_EQ( + x_ndim, + x_dims_mapping.size(), + common::errors::InvalidArgument("The Tensor X's rank [%d] and X's " + "dims_mapping size [%d] are not matched.", + x_ndim, + x_dims_mapping.size())); + PADDLE_ENFORCE_EQ( + weight_ndim, + weight_dims_mapping.size(), + common::errors::InvalidArgument("Tensor W's tensor rank [%d] and W's " + "dims_mapping size [%d] are not matched.", + weight_ndim, + weight_dims_mapping.size())); + PADDLE_ENFORCE_EQ(weight_ndim, + 2, + common::errors::InvalidArgument( + "CEmbedding table should have TWO dimension, " + "but got a tensor with [%d] dimension.", + weight_ndim)); + // determine parallel mode + int64_t weight_row_axis_mapping = weight_dims_mapping[0]; + + // Step1: Build Einsum Notation + std::string alphabet = "abcdefghilmnopqrstuvwxyz"; + std::string x_axes = GetBroadcastAxes(x_ndim, x_ndim, alphabet); + std::string weight_axes = "jk"; + std::string out_axes = x_axes + "k"; + + // Step2: Sharding Propagation + // Step2.1: merge input shardings + auto axis_to_dim_map = ShardingMergeForTensors( + {{x_axes, x_dims_mapping}, {weight_axes, weight_dims_mapping}}, false); + + // Step2.2: infer output's dims mapping. 
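+  // (illustrative) For a 2-D x, x_axes = "ab" and weight_axes = "jk" give
+  // out_axes = "abk": the output inherits x's sharding on the batch axes,
+  // while a weight sharded along "j" (the row axis) shows up as a partial
+  // status on the output, handled in Step3 below.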
+ TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src); + std::vector out_dims_mapping = + GetDimsMappingForAxes(out_axes, axis_to_dim_map); + out_dist_attr.set_dims_mapping(out_dims_mapping); + + // Step2.3: merge potential conflict in inputs, + TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src); + x_dist_attr_dst.set_dims_mapping( + GetDimsMappingForAxes(x_axes, axis_to_dim_map)); + TensorDistAttr weight_dist_attr_dst = + CopyTensorDistAttrForOutput(weight_dist_attr_src); + weight_dist_attr_dst.set_dims_mapping( + GetDimsMappingForAxes(weight_axes, axis_to_dim_map)); + + // Step3: Handle Partial + std::vector partial_on_dims; + if (weight_row_axis_mapping > -1) { + partial_on_dims.push_back(weight_row_axis_mapping); + } + out_dist_attr.set_partial_status(partial_on_dims); + VLOG(4) << "CEmbeddingInferSpmd:"; + VLOG(4) << "start_index: " << start_index; + VLOG(4) << "vocab_size: " << vocab_size; + LogInputDistAttr( + "Weight", weight_shape, weight.dist_attr(), weight_dist_attr_dst); + LogInputDistAttr("X", x_shape, x.dist_attr(), x_dist_attr_dst); + LogOutputDistAttr("Out", out_dist_attr); + VLOG(4) << std::endl; + + return {{weight_dist_attr_dst, x_dist_attr_dst}, {out_dist_attr}}; +} + +SpmdInfo CEmbeddingGradInferSpmd(const DistMetaTensor& weight, + const DistMetaTensor& x, + const DistMetaTensor& out_grad, + int start_index) { + PADDLE_ENFORCE_EQ(out_grad.dims().size(), + out_grad.dist_attr().dims_mapping().size(), + common::errors::InvalidArgument( + "The Tensor out_grad's rank [%d] and out_grad's " + "dims_mapping size [%d] are not matched.", + out_grad.dims(), + out_grad.dist_attr().dims_mapping().size())); + // primitive operators. + DistMetaTensor x_dst(x.dims(), x.dist_attr()); + DistMetaTensor w_dst(weight.dims(), weight.dist_attr()); + DistMetaTensor out_grad_dst(out_grad.dims(), out_grad.dist_attr()); + DistMetaTensor w_grad(weight.dims(), weight.dist_attr()); + + // Step1: t0 = onehot(x_dst, w_dst.shape[0]) = eye(num_classes)[x_dst] + auto t0_dims_mapping = x_dst.dist_attr().dims_mapping(); + t0_dims_mapping.emplace_back(-1); + TensorDistAttr t0_dist_attr(x.dist_attr()); + t0_dist_attr.set_dims_mapping(t0_dims_mapping); + auto t0_shape = phi::vectorize(x.dims()); + t0_shape.emplace_back(w_dst.dims()[0]); + DistMetaTensor t0(phi::make_ddim(t0_shape), t0_dist_attr); + + // Step2: w_grad = einsum('...j, ...k -> jk', t0, out_grad_dst) + // Step 2.1: Build Einsum Notation + std::string alphabet = "abcdefghijlmnopqrstuvwxyz"; + std::string t0_axes = + GetBroadcastAxes(t0.dims().size(), t0.dims().size(), alphabet); + std::string out_grad_dst_axes = t0_axes.substr(0, t0_axes.length() - 1) + "k"; + std::string w_grad_axes = t0_axes.substr(t0_axes.length() - 1, 1) + "k"; + + // Step2.2: Sharding Propagation + // Step2.2.1: merge input shardings + auto axis_to_dim_map = ShardingMergeForTensors( + {{t0_axes, t0.dist_attr().dims_mapping()}, + {out_grad_dst_axes, out_grad_dst.dist_attr().dims_mapping()}}, + false); + + // Step2.2.2: infer output's dims mapping. 
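+  // (illustrative) w_grad's axes are [one-hot axis of t0, hidden axis "k"
+  // of out_grad], so mesh dims that shard only the shared batch axes of
+  // t0/out_grad are resolved as a partial status on w_grad in Step2.2.4.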
+ auto w_grad_dist_attr = w_grad.dist_attr(); + std::vector w_grad_dims_mapping = + GetDimsMappingForAxes(w_grad_axes, axis_to_dim_map); + + // Step2.2.3: merge potential conflict in inputs, + t0_dist_attr.set_dims_mapping( + GetDimsMappingForAxes(t0_axes, axis_to_dim_map)); + auto out_grad_dst_dist_attr = + CopyTensorDistAttrForOutput(out_grad_dst.dist_attr()); + out_grad_dst_dist_attr.set_dims_mapping( + GetDimsMappingForAxes(out_grad_dst_axes, axis_to_dim_map)); + + // Step2.2.4: Handle Partial + std::vector partial_on_dims = + ResoluteOutputPartialDimension(axis_to_dim_map, w_grad_axes); + w_grad_dist_attr.set_partial_status(partial_on_dims); + + // Step2.3: Update inputs info. + t0 = DistMetaTensor(t0.dims(), t0_dist_attr); + const auto& t0_dims = t0.dist_attr().dims_mapping(); + std::vector new_dims_mapping(t0_dims.begin(), t0_dims.end() - 1); + if (x_dst.dist_attr().dims_mapping() != new_dims_mapping) { + TensorDistAttr t1(t0.dist_attr()); + t1.set_dims_mapping(new_dims_mapping); + x_dst = DistMetaTensor(x_dst.dims(), t1); + } + out_grad_dst = DistMetaTensor(out_grad_dst.dims(), out_grad_dst_dist_attr); + w_grad = DistMetaTensor(w_grad.dims(), w_grad_dist_attr); + VLOG(4) << "CEmbeddingGradInferSpmd:"; + VLOG(4) << "start_index: " << start_index; + LogInputDistAttr("Weight", + phi::vectorize(weight.dims()), + weight.dist_attr(), + w_dst.dist_attr()); + LogInputDistAttr( + "X", phi::vectorize(x.dims()), x.dist_attr(), x_dst.dist_attr()); + LogInputDistAttr("OutGrad", + phi::vectorize(out_grad.dims()), + out_grad.dist_attr(), + out_grad_dst.dist_attr()); + LogOutputDistAttr("WGrad", w_grad.dist_attr()); + VLOG(4) << std::endl; + return {{w_dst.dist_attr(), x_dst.dist_attr(), out_grad_dst.dist_attr()}, + {w_grad.dist_attr()}}; +} + +} // namespace phi::distributed diff --git a/paddle/phi/infermeta/spmd_rules/c_embedding.h b/paddle/phi/infermeta/spmd_rules/c_embedding.h new file mode 100644 index 0000000000000..3f3a9325270b0 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/c_embedding.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { + +SpmdInfo CEmbeddingInferSpmd(const DistMetaTensor& weight, + const DistMetaTensor& x, + int start_index, + int vocab_size); + +SpmdInfo CEmbeddingGradInferSpmd(const DistMetaTensor& weight, + const DistMetaTensor& x, + const DistMetaTensor& out_grad, + int start_index); +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.cc b/paddle/phi/infermeta/spmd_rules/rules.cc index 99deb1728aa0f..b100620ee0ba7 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.cc +++ b/paddle/phi/infermeta/spmd_rules/rules.cc @@ -561,6 +561,9 @@ PD_REGISTER_SPMD_RULE( embedding, PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmd), PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmdReverse)); +PD_REGISTER_SPMD_RULE(c_embedding, + PD_INFER_SPMD(phi::distributed::CEmbeddingInferSpmd), + PD_INFER_SPMD(phi::distributed::CEmbeddingGradInferSpmd)); PD_REGISTER_SPMD_RULE( lookup_table_v2, PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmd), diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index 697019f755fcb..f99cb45014560 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/add_n.h" #include "paddle/phi/infermeta/spmd_rules/amp_ops.h" #include "paddle/phi/infermeta/spmd_rules/argmax.h" +#include "paddle/phi/infermeta/spmd_rules/c_embedding.h" #include "paddle/phi/infermeta/spmd_rules/cast.h" #include "paddle/phi/infermeta/spmd_rules/concat.h" #include "paddle/phi/infermeta/spmd_rules/conv2d.h" diff --git a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml index 943107c514ad2..8e5533ead94f9 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml @@ -102,6 +102,7 @@ infer_meta : func : EmbeddingGradInferMeta param : [x, weight] + spmd_rule : CEmbeddingGradInferSpmd kernel : func : c_embedding_grad no_need_buffer : weight diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index 639f904516963..0a7b706ea40e0 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -119,6 +119,7 @@ infer_meta : func : CEmbeddingInferMeta param : [weight, x, start_index] + spmd_rule: CEmbeddingInferSpmd kernel : func : c_embedding param : [weight, x, start_index, vocab_size] diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 1165ddbbf3819..3df32cfaece3e 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -340,6 +340,7 @@ class _DPOptimizationConfig(TypedDict, total=False): # noqa: PYI049 set_field_default_config( MP_OPTIMIZATION, "allreduce_matmul_grad_overlapping", False ) +set_field_default_config(MP_OPTIMIZATION, "replace_with_c_embedding", False) if TYPE_CHECKING: diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index aec38a996cc10..d9998e7dee00c 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ 
b/python/paddle/distributed/auto_parallel/static/engine.py @@ -736,6 +736,16 @@ def _parallel_pir(self, mode): dist_program, startup_program, self._strategy.pipeline ) + if self._strategy.mp_optimization.replace_with_c_embedding: + config = {} + config["concrete_program"] = self.concrete_program + auto_parallel_c_embedding_pass = new_pass( + "auto_parallel_c_embedding_pass", config + ) + auto_parallel_c_embedding_pass.apply( + [dist_program], [startup_program] + ) + # Step 1.2: pir backward if mode == "train" and self._loss and self._optimizer: loss = dist_program.get_output_value_by_name(self._loss_names[0]) diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index a89e72c46775d..9dfd3d5a8ca90 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -20,6 +20,9 @@ AMPPass, AMPState, ) +from .auto_parallel_c_embedding import ( # noqa: F401 + AutoParallelCEmbeddingPass, +) from .auto_parallel_data_parallel_optimization import ( # noqa: F401 DataParallelOptimizationPass, GradientsGroup, diff --git a/python/paddle/distributed/passes/auto_parallel_c_embedding.py b/python/paddle/distributed/passes/auto_parallel_c_embedding.py new file mode 100644 index 0000000000000..4e3f3f242302a --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_c_embedding.py @@ -0,0 +1,345 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
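+
+# Rough flow of this pass: shard the embedding weight row-wise along the
+# mp axis, swap pd_op.embedding for c_embedding with a rank-local vocab
+# start_index, insert a reshard that all-reduces the partial lookup result,
+# and finally propagate the adjusted dims_mappings forward and backward.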
+
+import re
+import warnings
+
+import paddle
+import paddle.distributed as dist
+from paddle.base.core import TensorDistAttr
+from paddle.distributed import fleet
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    DistTensorSpec,
+)
+from paddle.distributed.fleet.meta_optimizers.common import OpRole
+from paddle.framework import core
+
+from .pass_base import PassBase, register_pass
+
+
+@register_pass("auto_parallel_c_embedding_pass")
+class AutoParallelCEmbeddingPass(PassBase):
+    def __init__(self):
+        super().__init__()
+
+    def _check_self(self):
+        hcg = fleet.get_hybrid_communicate_group()
+        mp_size = hcg.get_model_parallel_world_size()
+        if mp_size > 1:
+            return True
+        warnings.warn("c_embedding pass is only applicable to tensor parallel.")
+        return False
+
+    def _check_conflict(self, other_pass):
+        return True
+
+    def _apply_single_impl(self, main_program, startup_program, context):
+        concrete_program = self.get_attr("concrete_program")
+        ops = main_program.global_block().ops
+        for op in ops:
+            if op.name() == 'pd_op.embedding':
+                # update the weight's dims_mapping
+                mp_axis = self._update_weight(op, concrete_program)
+                # update the startup_program accordingly
+                self._update_startup_program(startup_program, mp_axis)
+                # replace embedding with c_embedding
+                c_emb_op = self._replace_embedding_with_c_embedding(op)
+                # insert an allreduce (as a reshard op)
+                comm_op = self._insert_allreduce_reshard(c_emb_op)
+                # update dims_mapping of ops before c_embedding
+                self._update_before_dims_mapping(c_emb_op)
+                # update dims_mapping of ops after c_embedding
+                self._update_after_dims_mapping(comm_op)
+
+    def _update_weight(self, op, concrete_program):
+        # update the weight's dims_mapping in the concrete_program
+        placements = op.operand(1).source().placements
+        dim_map, partial_status = dist.auto_parallel.placement_type.to_dim_map(
+            placements, op.operand(1).source().ndim
+        )
+        # mp_axis specifies the mesh axis used for row parallelism
+        mp_axis = -1
+        dim_map = [-1, -1]
+        hcg = fleet.get_hybrid_communicate_group()
+        mp_size = hcg.get_model_parallel_world_size()
+        if mp_size > 1:
+            strategy = fleet.DistributedStrategy()
+            # get mp_axis from the DistributedStrategy hybrid configs
+            mp_axis = strategy.hybrid_configs['mp_degree']
+            dim_map = [mp_axis, -1]
+        dist_attr_w = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+            op.operand(1).source().process_mesh,
+            dim_map,
+            partial_status,
+        )
+        dist_type_input0 = paddle.base.libpaddle.pir.cvt_to_dist_type(
+            op.operand(1).source().type(), dist_attr_w
+        )
+        op.operand(1).source().set_type(dist_type_input0)
+        # update the c_embedding weight's dynamic parameters
+        dy_params = concrete_program.parameters[0]
+        pattern = re.compile(r'embedding_.*\.w_0\.dist')
+        for param in dy_params:
+            if pattern.match(param.name):
+                var_dist_attr = TensorDistAttr()
+                var_dist_attr.process_mesh = dist_attr_w.process_mesh
+                var_dist_attr.dims_mapping = dist_attr_w.dims_mapping
+                tmp = paddle.base.core.reshard(param, var_dist_attr)
+                param.get_tensor()._share_data_with(tmp.get_tensor())
+        return mp_axis
+
+    def _replace_embedding_with_c_embedding(self, op):
+        paddle.pir.set_insertion_point(op)
+        num_embeddings = op.operand(1).source().type().shape[0]
+        hcg = fleet.get_hybrid_communicate_group()
+        # compute the start_index from the MP world size and rank
+        mp_size = hcg.get_model_parallel_world_size()
+        mp_rank = hcg.get_model_parallel_rank()
+        per_part_size = num_embeddings // mp_size
+        vocab_start_index = mp_rank * per_part_size
+        t_op = paddle._C_ops.c_embedding(
+            op.operand(1).source(),
+            op.operand(0).source(),
+            vocab_start_index,
+            num_embeddings,
+        )
+        t_op.get_defining_op().op_role = int(OpRole.Forward)
+        new_op = t_op.get_defining_op()
+        op.result(0).replace_all_uses_with(t_op)
+        op.erase()
+        return new_op
+
+    def _insert_allreduce_reshard(self, c_emb_op):
+        result = c_emb_op.result(0)
+        paddle.pir.set_insertion_point_after(c_emb_op)
+        placements = result.dist_attr().placements
+        dim_map, partial_status = dist.auto_parallel.placement_type.to_dim_map(
+            placements, result.ndim
+        )
+        partial_status = {}
+        dist_attr_new = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+            result.process_mesh,
+            dim_map,
+            partial_status,
+        )
+        # insert an allreduce by resharding the partial output to a
+        # dist_attr with an empty partial status
+        comm_op_t = paddle._C_ops.reshard_v2(result, dist_attr_new)
+        comm_op_t.get_defining_op().op_role = int(OpRole.Forward)
+        result.replace_all_uses_with(comm_op_t)
+        comm_op = comm_op_t.get_defining_op()
+        comm_op.operand(0).set_source(result)
+        return comm_op
+
+    def _update_before_dims_mapping(self, new_op):
+        placements = new_op.operand(0).source().placements
+        stack = [new_op.operand(0).source().get_defining_op()]
+        # walk backward from c_embedding's input, adjusting every op until
+        # the parameter inputs are reached
+        while stack:
+            op = stack.pop()
+            operands, results = [], []
+            if op.num_results() > 0:
+                for result, result_dist in zip(
+                    op.results(), op.dist_attr.results()
+                ):
+                    placements_dist = (
+                        result_dist.as_tensor_dist_attr().placements
+                    )
+                    if placements != placements_dist:
+                        dim_map, partial_status = (
+                            dist.auto_parallel.placement_type.to_dim_map(
+                                placements, result.ndim
+                            )
+                        )
+                        dist_attr_new = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+                            result.process_mesh,
+                            dim_map,
+                            partial_status,
+                        )
+                        dist_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                            result.type(), dist_attr_new
+                        )
+                        result.set_type(dist_type)
+                    results.append(dist_attr_new)
+            sub_name = op.name().split('.')[1]
+            if op.num_operands() > 0:
+                assert (
+                    sub_name != "cast"
+                ), f"Need to add support for {sub_name}."
+                operands.append(dist_attr_new)
+                next_op = op.operand(0).source().get_defining_op()
+                stack.append(next_op)
+            process_mesh = (
+                op.results()[0].process_mesh
+                if op.num_results() > 0
+                else op.operand(0).source().process_mesh
+            )
+            op.dist_attr = (
+                paddle.base.libpaddle.pir.create_op_dist_attribute(
+                    process_mesh,
+                    operands,
+                    results,
+                )
+            )
+
+    def _update_after_dims_mapping(self, new_op):
+        placements = new_op.result(0).placements
+        pre_id = new_op.id()
+        stack = list(new_op.result(0).all_used_ops())
+        # walk forward from c_embedding, adjusting every op until the
+        # placements become consistent
+        while stack:
+            op = stack.pop()
+            operands, results = [], []
+            if op.num_operands() > 0:
+                for operand, operand_dist in zip(
+                    op.operands_source(), op.dist_attr.operands()
+                ):
+                    if operand.get_defining_op().id() != pre_id:
+                        continue
+                    placements_dist = (
+                        operand_dist.as_tensor_dist_attr().placements
+                    )
+                    if placements != placements_dist:
+                        dim_map, partial_status = (
+                            dist.auto_parallel.placement_type.to_dim_map(
+                                placements, operand.ndim
+                            )
+                        )
+                        dist_attr_new = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+                            operand.process_mesh,
+                            dim_map,
+                            partial_status,
+                        )
+                        dist_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                            operand.type(), dist_attr_new
+                        )
+                        operand.set_type(dist_type)
+                        operands.append(dist_attr_new)
+                        sub_name = op.name().split('.')[1]
+                        if sub_name == 'reshard':
+                            # only change the reshard op's inputs
+                            placements_out0 = op.results()[0].placements
+                            dim_map_out0, partial_status_out0 = (
+                                dist.auto_parallel.placement_type.to_dim_map(
+                                    placements_out0,
+                                    op.results()[0].ndim,
+                                )
+                            )
+                            dist_attr_out0 = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+                                op.results()[0].process_mesh,
+                                dim_map_out0,
+                                partial_status_out0,
+                            )
+                            results.append(dist_attr_out0)
+                        elif core.contains_spmd_rule(sub_name):
+                            # re-run spmd rule inference for this op
+                            rule = core.get_phi_spmd_rule(sub_name)
+                            tensor_dist_attr = TensorDistAttr()
+                            tensor_dist_attr.dims_mapping = dim_map
+                            partial_dims = []
+                            for i, p in enumerate(placements):
+                                if isinstance(p, dist.Partial):
+                                    partial_dims.append(i)
+                            if len(partial_dims) > 0:
+                                tensor_dist_attr._set_partial_dims(partial_dims)
+                            tensor_dist_attr.process_mesh = operand.process_mesh
+                            inputs = DistTensorSpec(
+                                operand.shape, tensor_dist_attr
+                            )
+                            attr_names = op.get_attr_names()
+                            input_specs = [inputs]
+                            for attr_name in attr_names:
+                                input_specs.append(op.attrs()[attr_name])
+                            infered_dist_attrs = rule.infer_forward(
+                                *input_specs
+                            )
+                            dims_mapping_new_out = infered_dist_attrs[1][
+                                0
+                            ].dims_mapping
+                            partial_status = {}
+                            if infered_dist_attrs[1][0]._is_partial():
+                                partial_dims = infered_dist_attrs[1][
+                                    0
+                                ]._partial_dims()
+                                for i in partial_dims:
+                                    partial_status[i] = (
+                                        paddle.base.core.ReduceType.kRedSum
+                                    )
+                            dist_attr_new_out = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+                                operand.process_mesh,
+                                dims_mapping_new_out,
+                                partial_status,
+                            )
+                            dist_type = (
+                                paddle.base.libpaddle.pir.cvt_to_dist_type(
+                                    op.result(0).type(), dist_attr_new_out
+                                )
+                            )
+                            op.result(0).set_type(dist_type)
+                            results.append(dist_attr_new_out)
+                            next_op = op.results()[0].all_used_ops()[0]
+                            stack.append(next_op)
+                            pre_id = op.id()
+                            placements = dist_attr_new_out.placements
+                        else:
+                            results.append(dist_attr_new)
+                            next_op = op.results()[0].all_used_ops()[0]
+                            stack.append(next_op)
+                            pre_id = op.id()
+
+            process_mesh = (
+                op.results()[0].process_mesh
+                if op.num_results() > 0
+                else op.operand(0).source().process_mesh
+            )
+            op.dist_attr = (
+                paddle.base.libpaddle.pir.create_op_dist_attribute(
+                    process_mesh,
+                    operands,
+                    results,
+                )
+            )
+
+    def _update_startup_program(self, startup_program, mp_axis):
+        # modify the startup_program as well, since the optimizer reads the
+        # parameter's dist_attr from it
+        startup_block = startup_program.global_block()
+        for op in startup_block.ops:
+            if op.name() == 'pd_op.full':
+                next_op = op.result(0).all_used_ops()[0]
+                parameter_name = next_op.str_attr("parameter_name")
+                pattern = re.compile(r'embedding_.*\.w_0\.dist')
+                if pattern.match(parameter_name):
+                    placements = op.results()[0].placements
+                    dim_map, partial_status = (
+                        dist.auto_parallel.placement_type.to_dim_map(
+                            placements, len(placements)
+                        )
+                    )
+                    dim_map = [mp_axis, -1]
+                    dist_attr = (
+                        paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+                            op.results()[0].process_mesh,
+                            dim_map,
+                            partial_status,
+                        )
+                    )
+                    dist_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                        op.results()[0].type(), dist_attr
+                    )
+                    op.results()[0].set_type(dist_type)
+                    op.dist_attr = (
+                        paddle.base.libpaddle.pir.create_op_dist_attribute(
+                            op.results()[0].process_mesh, [], [dist_attr]
+                        )
+                    )
diff --git a/test/auto_parallel/pir/CMakeLists.txt b/test/auto_parallel/pir/CMakeLists.txt
index cedf3e53eee33..37703872c6757 100644
--- a/test/auto_parallel/pir/CMakeLists.txt
+++ b/test/auto_parallel/pir/CMakeLists.txt
@@ -21,6 +21,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
     test_semi_auto_parallel_dist_to_static_pir_decomp MODULES
     test_semi_auto_parallel_dist_to_static_pir_decomp ENVS
     FLAGS_enable_pir_api=1 FLAGS_dist_prim_all=1)
+  py_test_modules(
+    test_auto_parallel_c_embedding_pass MODULES
+    test_auto_parallel_c_embedding_pass ENVS FLAGS_enable_pir_api=1
+    FLAGS_dist_prim_all=1)
   py_test_modules(test_reshard MODULES test_reshard ENVS
                   FLAGS_enable_pir_api=1)
   py_test_modules(test_learning_rate MODULES test_learning_rate ENVS
                   FLAGS_enable_pir_api=1)
@@ -32,6 +36,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 300)
   set_tests_properties(test_semi_auto_parallel_dist_to_static_pir_decomp
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 60)
+  set_tests_properties(test_auto_parallel_c_embedding_pass
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 300)
   py_test_modules(
     test_eliminate_transpose_pass MODULES test_eliminate_transpose_pass ENVS
     FLAGS_enable_pir_in_executor=1)
diff --git a/test/auto_parallel/pir/auto_parallel_c_embedding_pass.py b/test/auto_parallel/pir/auto_parallel_c_embedding_pass.py
new file mode 100644
index 0000000000000..7c0ebb7f1083a
--- /dev/null
+++ b/test/auto_parallel/pir/auto_parallel_c_embedding_pass.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
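+
+# End-to-end check for auto_parallel_c_embedding_pass, run on two
+# model-parallel ranks: the same EmbeddingNet is trained dy2static twice,
+# once with replace_with_c_embedding enabled and once without, and a
+# dynamic-graph VocabParallelEmbedding model serves as the reference; the
+# three loss curves must match bit-for-bit (compared via md5 of the loss
+# arrays).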
+ +import hashlib +import os +import random + +import numpy as np + +import paddle +import paddle.distributed as dist +from paddle import nn +from paddle.distributed import fleet +from paddle.io import DataLoader + +BATCH_SIZE = 2 +SEQ_LEN = 50 +VOCAB_SIZE = 200 +HIDDEN_SIZE = 100 + + +class CEmbeddingNet(nn.Layer): + def __init__(self, mesh): + super().__init__() + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + VOCAB_SIZE, + HIDDEN_SIZE, + weight_attr=paddle.nn.initializer.Constant(value=0.5), + ) + + def forward(self, x): + x = paddle.to_tensor(x, dtype="int32") + out = self.embedding(x) + out = out.astype(self.embedding.weight.dtype) + out = paddle.transpose(out, [1, 0, 2]) + t = paddle.randn([SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE]) + out = out * t + out = paddle.transpose(out, [1, 0, 2]) + return out + + +class EmbeddingNet(nn.Layer): + def __init__(self, mesh): + super().__init__() + self.embedding = paddle.nn.Embedding( + VOCAB_SIZE, + HIDDEN_SIZE, + weight_attr=paddle.nn.initializer.Constant(value=0.5), + ) + self.mesh_ = mesh + self.embedding.weight = dist.shard_tensor( + self.embedding.weight, + mesh, + [dist.Replicate(), dist.Shard(1)], + stop_gradient=False, + ) + + def forward(self, x): + out = self.embedding(x) + out = out.astype(self.embedding.weight.dtype) + out = paddle.transpose(out, [1, 0, 2]) + out = dist.reshard( + out, self.mesh_, [dist.Replicate(), dist.Replicate()] + ) + t = paddle.randn([SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE]) + out = out * t + out = paddle.transpose(out, [1, 0, 2]) + return out + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, inputs, labels, num_samples): + self.inputs = inputs + self.labels = labels + self.num_samples = num_samples + + def __getitem__(self, idx): + return self.inputs[idx], self.labels[idx] + + def __len__(self): + return self.num_samples + + +class TestSimpleNetForSemiAutoParallel: + def __init__(self): + self._seed = eval(os.getenv("seed")) + self.mesh = dist.ProcessMesh([[0, 1]]) + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 2, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + + def set_random_seed(self, seed): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + def create_data_loader(self): + inputs = np.random.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN)) + labels = np.random.rand(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE).astype( + 'float32' + ) + dataset = RandomDataset(inputs, labels, BATCH_SIZE) + loader = DataLoader(dataset, batch_size=BATCH_SIZE) + return loader + + def run_dy2static(self, layer, opt, dist_loader, use_pass): + loss_fn = nn.MSELoss() + strategy = dist.Strategy() + strategy._mp_optimization.replace_with_c_embedding = use_pass + dist_model = dist.to_static( + layer, dist_loader, loss_fn, opt, strategy=strategy + ) + loss_list = [] + dist_model._engine._mode = "train" + dist_model.train() + dist_program = dist_model._engine._pir_dist_main_progs["train"] + op_name = dist_program.global_block().ops[8].name() + expected_op = 'pd_op.c_embedding' if use_pass else 'pd_op.embedding' + np.testing.assert_equal(op_name, expected_op) + for epoch in range(3): + for batch_id, data in enumerate(dist_loader()): + x, label = data + loss = dist_model(x, label) + loss_list.append(loss) + return np.array(loss_list), dist_model + + def run_dynamic(self, layer, opt, dist_loader): + loss_fn = nn.MSELoss() + loss_list = [] + for epoch in range(3): + for batch_id, data in enumerate(dist_loader()): + x, label = data + out 
= layer(x) + loss = loss_fn(out, label) + loss_list.append(loss.numpy()) + loss.backward() + opt.step() + opt.clear_grad() + return np.array(loss_list) + + def test_mp_demo_net(self): + paddle.disable_static() + paddle.base.set_flags({'FLAGS_enable_pir_api': 1}) + self.set_random_seed(self._seed) + data_loader = self.create_data_loader() + dist_dataloader = dist.shard_dataloader( + dataloader=data_loader, + meshes=[self.mesh], + ) + self.set_random_seed(self._seed) + dy2static_layer_use_pass = EmbeddingNet(self.mesh) + dy2static_opt_use_pass = paddle.optimizer.AdamW( + learning_rate=0.1, parameters=dy2static_layer_use_pass.parameters() + ) + loss_pass, dist_model_use_pass = self.run_dy2static( + dy2static_layer_use_pass, + dy2static_opt_use_pass, + dist_dataloader, + True, + ) + self.set_random_seed(self._seed) + dy2static_layer = EmbeddingNet(self.mesh) + dy2static_opt = paddle.optimizer.AdamW( + learning_rate=0.1, parameters=dy2static_layer.parameters() + ) + loss_st, dist_model = self.run_dy2static( + dy2static_layer, dy2static_opt, dist_dataloader, False + ) + self.set_random_seed(self._seed) + dy_layer = CEmbeddingNet(self.mesh) + dy_opt = paddle.optimizer.AdamW( + learning_rate=0.1, parameters=dy_layer.parameters() + ) + loss_dy = self.run_dynamic(dy_layer, dy_opt, data_loader) + md5_pass = hashlib.md5(loss_pass.tobytes()).hexdigest() + md5_st = hashlib.md5(loss_st.tobytes()).hexdigest() + md5_dy = hashlib.md5(loss_dy.tobytes()).hexdigest() + np.testing.assert_equal(md5_pass, md5_st) + np.testing.assert_equal(md5_pass, md5_dy) + + def run_test_case(self): + self.test_mp_demo_net() + + +if __name__ == '__main__': + TestSimpleNetForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/pir/test_auto_parallel_c_embedding_pass.py b/test/auto_parallel/pir/test_auto_parallel_c_embedding_pass.py new file mode 100644 index 0000000000000..e250426cbf9ce --- /dev/null +++ b/test/auto_parallel/pir/test_auto_parallel_c_embedding_pass.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
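+
+# Launcher for the script above: runs auto_parallel_c_embedding_pass.py on
+# two devices through the collective communication test harness.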
+import tempfile +import unittest + +import collective.test_communication_api_base as test_base + + +class TestAutoParallelCEmbeddingPass(test_base.CommunicationTestDistBase): + def setUp(self): + super().setUp( + num_of_devices=2, + timeout=300, + ) + self._default_envs = {"dtype": "float32", "seed": "2024"} + self._changeable_envs = {"backend": ["gpu"]} + + def test_mlp(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + ckpt_path_tmp = tempfile.TemporaryDirectory() + envs["ckpt_path"] = ckpt_path_tmp.name + self.run_test_case( + "auto_parallel_c_embedding_pass.py", + user_defined_envs=envs, + ) + ckpt_path_tmp.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/spmd_rules/test_c_embedding_rule.py b/test/auto_parallel/spmd_rules/test_c_embedding_rule.py new file mode 100644 index 0000000000000..213101ea274b3 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_c_embedding_rule.py @@ -0,0 +1,146 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from collections import OrderedDict + +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto +from paddle.framework import core + + +class TestEmbeddingSPMDRule(unittest.TestCase): + def setUp(self): + self.rule = core.get_phi_spmd_rule("c_embedding") + + def test_c_embedding_infer_forward(self): + # forward setup + table_shape = [512, 768] # [V,H] + x_shape = [4, 1024] # [B,S] + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + table_tensor_dist_attr = TensorDistAttr() + table_tensor_dist_attr.process_mesh = process_mesh + self.table_dist_tensor_spec = DistTensorSpec( + table_shape, table_tensor_dist_attr + ) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.attrs = OrderedDict([('start_index', 0), ('vocab_size', -1)]) + + # data parallel + self.table_dist_tensor_spec.set_dims_mapping([-1, -1]) + self.x_dist_tensor_spec.set_dims_mapping([1, -1]) + result_dist_attrs = self.rule.infer_forward( + self.table_dist_tensor_spec, + self.x_dist_tensor_spec, + self.attrs['start_index'], + self.attrs['vocab_size'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 2) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, -1, -1]) + + # table row-wise parallel + self.table_dist_tensor_spec.set_dims_mapping([1, -1]) + self.x_dist_tensor_spec.set_dims_mapping([-1, -1]) + 
result_dist_attrs = self.rule.infer_forward( + self.table_dist_tensor_spec, + self.x_dist_tensor_spec, + self.attrs['start_index'], + self.attrs['vocab_size'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, -1]) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1] + ) + self.assertEqual(infered_output_dist_attrs[0]._is_partial(), True) + self.assertEqual(infered_output_dist_attrs[0]._partial_dims(), {1}) + + def test_c_embedding_infer_backward(self): + # backward setup + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + table_shape = [512, 768] # [V,H] + x_shape = [4, 1024] # [B,S] + table_tensor_dist_attr = TensorDistAttr() + table_tensor_dist_attr.process_mesh = process_mesh + self.table_dist_tensor_spec = DistTensorSpec( + table_shape, table_tensor_dist_attr + ) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + out_shape = [4, 1024, 768] # [B,S,H] + out_tensor_dist_attr = TensorDistAttr() + out_tensor_dist_attr.process_mesh = process_mesh + self.out_dist_tensor_spec = DistTensorSpec( + out_shape, out_tensor_dist_attr + ) + self.attrs = OrderedDict([('start_index', 0), ('vocab_size', -1)]) + + # table row-wise parallel + self.table_dist_tensor_spec.set_dims_mapping([1, -1]) + self.x_dist_tensor_spec.set_dims_mapping([-1, -1]) + self.out_dist_tensor_spec.set_dims_mapping([-1, -1, -1]) + result_dist_attrs = self.rule.infer_backward( + self.table_dist_tensor_spec, + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['start_index'], + self.attrs['vocab_size'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 3) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1, -1]) + self.assertEqual(infered_input_dist_attrs[2].dims_mapping, [-1, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, -1]) + + # data parallel + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + self.out_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_backward( + self.table_dist_tensor_spec, + self.x_dist_tensor_spec, + self.out_dist_tensor_spec, + self.attrs['start_index'], + self.attrs['vocab_size'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 3) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0, -1]) + self.assertEqual(infered_input_dist_attrs[2].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, -1]) + + +if __name__ == "__main__": + unittest.main() From c50b16e1aa74b7136a20d2fa545a35b02258f8b8 Mon Sep 17 00:00:00 2001 From: crazyxiaoxi <113622186+crazyxiaoxi@users.noreply.github.com> Date: Sat, 12 Oct 2024 14:23:48 +0800 Subject: [PATCH 089/135] 
[CINN] 【Infer Symbolic Shape BUAA】Add flash_attn_varlen_qkvpacked op (#68318)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* try

* try

* fix bug

* build fix

* fix

* comment
---
 .../multiary_infer_sym.cc                     | 75 +++++++++++++++++--
 .../infer_symbolic_shape/multiary_infer_sym.h |  2 +-
 paddle/phi/ops/yaml/ops.yaml                  |  2 +-
 3 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc
index c9138af11ff60..9862430317afd 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc
@@ -1671,12 +1671,75 @@ bool FusedFeedforwardOpInferSymbolicShape(
 //   return true;
 // }

-// bool FlashAttnQkvpackedOpInferSymbolicShape(pir::Operation *op,
-//                                             pir::InferSymbolicShapeContext
-//                                             *infer_context) {
-//   // pass
-//   return true;
-// }
+bool FlashAttnVarlenQkvpackedOpInferSymbolicShape(
+    pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) {
+  const auto &qkv_shape_or_data =
+      infer_context->GetShapeOrDataForValue(op->operand_source(0));
+  const std::vector<symbol::DimExpr> &qkv_shape = qkv_shape_or_data.shape();
+
+  auto round_multiple = [](symbol::DimExpr x) {
+    auto m = symbol::DimExpr{128};
+    auto m_minus_one = symbol::DimExpr{127};
+    return (x + m_minus_one) / m * m;
+  };
+
+  size_t rank = qkv_shape.size();
+  PADDLE_ENFORCE_EQ((rank == 4UL || rank == 5UL),
+                    true,
+                    common::errors::InvalidArgument(
+                        "qkv dims must be 4 (unpadded) or 5 (padded batch)"));
+  std::vector<symbol::DimExpr> out_dims;
+  std::vector<symbol::DimExpr> softmax_lse_shape;
+  std::vector<symbol::DimExpr> softmax_shape;
+  if (rank == 4UL) {
+    // qkv [total_*,nheads/nheads_k+2,nheads_k,headdim]
+    out_dims = {qkv_shape[0], (qkv_shape[1] - 2) * qkv_shape[2], qkv_shape[3]};
+    softmax_shape = {qkv_shape[0],
+                     (qkv_shape[1] - 2) * qkv_shape[2],
+                     infer_context->GetNextSymName(),
+                     infer_context->GetNextSymName()};
+    softmax_lse_shape = {qkv_shape[0],
+                         (qkv_shape[1] - 2) * qkv_shape[2],
+                         infer_context->GetNextSymName()};
+  } else if (rank == 5UL) {
+    // qkv [batchsize,seqlen,nheads/nheads_k+2,nheads_k,headdim]
+    out_dims = {qkv_shape[0],
+                qkv_shape[1],
+                (qkv_shape[2] - 2) * qkv_shape[3],
+                qkv_shape[4]};
+    softmax_shape = {qkv_shape[0],
+                     (qkv_shape[2] - 2) * qkv_shape[3],
+                     round_multiple(qkv_shape[1]),
+                     infer_context->GetNextSymName()};
+    softmax_lse_shape = {qkv_shape[0],
+                         (qkv_shape[2] - 2) * qkv_shape[3],
+                         round_multiple(qkv_shape[1])};
+  }
+
+  // Set output tensor shapes
+  infer_context->SetShapeOrDataForValue(
+      op->result(0),
+      symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(out_dims)});
+
+  infer_context->SetShapeOrDataForValue(
+      op->result(1),
+      symbol::ShapeOrDataDimExprs{
+          symbol::TensorShapeOrDataDimExprs(softmax_shape)});
+
+  infer_context->SetShapeOrDataForValue(
+      op->result(2),
+      symbol::ShapeOrDataDimExprs{
+          symbol::TensorShapeOrDataDimExprs(softmax_lse_shape)});
+
+  if (!paddle::dialect::details::IsFakeValue(op->result(3))) {
+    std::vector<symbol::DimExpr> seed_offset_dims = {symbol::DimExpr(2)};
+    infer_context->SetShapeOrDataForValue(
+        op->result(3),
+        symbol::ShapeOrDataDimExprs{
+            symbol::TensorShapeOrDataDimExprs(seed_offset_dims)});
+  }
+  return true;
+}

 // bool FlashAttnUnpaddedOpInferSymbolicShape(pir::Operation *op,
 //
pir::InferSymbolicShapeContext diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index 28e9e1861ed31..02dda29cd5a06 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -55,7 +55,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(FullWithTensor) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttn) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedFeedforward) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedAttention) -// OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnQkvpacked) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnVarlenQkvpacked) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnUnpadded) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedBatchNormAct) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedBatchNormAct_) diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 3ec01a70bd55a..4e75631cd508b 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -1925,7 +1925,7 @@ data_type : qkv intermediate : softmax_lse, seed_offset backward : flash_attn_varlen_qkvpacked_grad - # interfaces : paddle::dialect::InferSymbolicShapeInterface + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : flashmask_attention args : (Tensor q, Tensor k, Tensor v, Tensor startend_row_indices, Tensor fixed_seed_offset, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false, str rng_name = "") From bed96c68d398006322c33040cf69f3017026fcab Mon Sep 17 00:00:00 2001 From: cubehan3 Date: Sat, 12 Oct 2024 15:41:10 +0800 Subject: [PATCH 090/135] [Pir] Add paddle::dialect::ForwardOnlyTrait (#68580) * add for * add forward only trait for static_ops.yaml * add forward only trait for dygraph_ops.yaml * fixed bugs * add ForwardOnlyTrait for unique in static_ops.yaml --- .../ops/yaml/inconsistent/dygraph_ops.yaml | 4 + .../phi/ops/yaml/inconsistent/static_ops.yaml | 1 + paddle/phi/ops/yaml/legacy/static_ops.yaml | 29 ++++ paddle/phi/ops/yaml/ops.yaml | 151 +++++++++++++++--- 4 files changed, 166 insertions(+), 19 deletions(-) diff --git a/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml b/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml index 0bc5f97ba1213..5cc494204f5f2 100755 --- a/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/dygraph_ops.yaml @@ -133,6 +133,7 @@ kernel : func : embedding_grad data_type : weight + traits : paddle::dialect::ForwardOnlyTrait - op : equal args : (Tensor x, Tensor y) @@ -165,6 +166,7 @@ data_type : params optional : skip_update, master_params inplace : (params -> params_out), (moments1 -> moments1_out), (moments2 -> moments2_out), (beta1_pows -> beta1_pows_out), (beta2_pows -> beta2_pows_out), (master_params -> master_params_out) + traits : paddle::dialect::ForwardOnlyTrait - op : fused_gemm_epilogue args : (Tensor x, Tensor y, Tensor bias, bool trans_x, bool trans_y, str activation) @@ -338,6 +340,7 @@ kernel : func : sync_comm_stream data_type : DataType::FLOAT32 + traits : paddle::dialect::ForwardOnlyTrait - op : tile args : (Tensor x, IntArray repeat_times = {}) @@ -359,3 +362,4 @@ func : unique data_type : x optional : indices, inverse, counts + traits : paddle::dialect::ForwardOnlyTrait diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index 0a7b706ea40e0..fd52f7d48418b 100644 --- 
a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -1012,6 +1012,7 @@ data_type : x interfaces : paddle::dialect::ParseKernelKeyInterface interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : write_to_array args : (Tensor i, Tensor x) diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index 8705103f1f48c..d8b01bf571e32 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -16,6 +16,7 @@ kernel : func : all_reduce param: [x, reduce_type] + traits : paddle::dialect::ForwardOnlyTrait - op : amax args : (Tensor x, IntArray axis={0}, bool keepdim=false, bool reduce_all=false, int in_dtype=-1, int out_dtype=-1) @@ -91,6 +92,7 @@ func: BeamSearchDecodeInferMeta kernel: func: beam_search_decode + traits : paddle::dialect::ForwardOnlyTrait - op : broadcast args : (Tensor x, int ring_id = 0, int root = 0) @@ -101,6 +103,7 @@ kernel : func : broadcast param: [x, root] + traits : paddle::dialect::ForwardOnlyTrait - op : comm_init_all args : (int[] devices={}, int ring_id=0) @@ -111,6 +114,7 @@ kernel : func : comm_init_all data_type : DataType::FLOAT32 + traits : paddle::dialect::ForwardOnlyTrait - op : conv2d_transpose args : (Tensor x, Tensor filter, Tensor bias, int[] strides={1, 1}, int[] paddings={0, 0}, int[] output_padding={}, IntArray output_size={}, str padding_algorithm="EXPLICIT", int groups=1, int[] dilations={1, 1}, str data_format="NCHW") @@ -135,6 +139,7 @@ func : conv2d_transpose_bias param : [x, filter, bias, strides, paddings, output_padding, output_size, padding_algorithm, groups, dilations, data_format] data_type : x + traits : paddle::dialect::ForwardOnlyTrait - op : cross_entropy args: (Tensor x, Tensor label, bool soft_label = false, int ignore_index = -100) @@ -165,6 +170,7 @@ kernel : func : decode_jpeg param : [x, mode] + traits : paddle::dialect::ForwardOnlyTrait - op : deformable_conv args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides={1, 1}, int[] paddings={0, 0}, int[] dilations={1, 1}, int deformable_groups=1, int groups=1, int im2col_step=64) @@ -198,6 +204,7 @@ kernel : func : dist_concat param: [x, nranks] + traits : paddle::dialect::ForwardOnlyTrait - op : einsum args : (Tensor[] x, str equation) @@ -242,6 +249,7 @@ func : empty param : [shape, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : equal args : (Tensor x, Tensor y, int axis = -1, bool force_cpu=false) @@ -277,6 +285,7 @@ func : eye param : [num_rows, num_columns, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : fetch_barrier args: (Tensor[] x, int trainer_id = 0, str[] endpoints = {"127.0.0.1:6164"}) @@ -286,6 +295,7 @@ kernel: func: fetch_barrier optional: x + traits : paddle::dialect::ForwardOnlyTrait - op : flatten args : (Tensor x, int start_axis, int stop_axis) @@ -401,6 +411,7 @@ func: hash param: [x, num_hash, mod_by] data_type: x + traits : paddle::dialect::ForwardOnlyTrait - op : legacy_bilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_format="NCHW", int out_d=0, int out_h=0, int out_w=0, float scale=0.0, str interp_method="bilinear", bool align_corners=true, int align_mode=1) @@ -448,6 +459,7 @@ func: legacy_generate_proposals data_type: anchors optional: rpn_rois_num + traits : paddle::dialect::ForwardOnlyTrait - op : legacy_nearest_interp args : 
(Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_format="NCHW", int out_d=0, int out_h=0, int out_w=0, float scale=0.0, str interp_method="bilinear", bool align_corners=true, int align_mode=1) @@ -499,6 +511,7 @@ func : linspace param: [start, stop, number, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : lrn args: (Tensor x, int n = 5, float k = 2.0, float alpha = 0.0001, float beta = 0.75, str data_format = "AnyLayout") @@ -542,6 +555,7 @@ func : matrix_rank {dense -> dense}, matrix_rank_tol {dense, dense -> dense} data_type : x + traits : paddle::dialect::ForwardOnlyTrait - op : max args : (Tensor x, IntArray axis={0}, bool keepdim=false, bool reduce_all=false, int in_dtype=-1, int out_dtype=-1) @@ -626,6 +640,7 @@ func : p_recv param : [peer, dtype, dynamic_shape] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : p_recv_array args : (int ring_id = 0, int peer = 0, DataType dtype = DataType::FLOAT32, int[] out_shape = {}) @@ -636,6 +651,7 @@ kernel : func : p_recv_array param : [peer, dtype, out_shape] + traits : paddle::dialect::ForwardOnlyTrait - op : p_send args : (Tensor x, int ring_id = 0, int peer = 0, bool dynamic_shape = false) @@ -647,6 +663,7 @@ func : p_send param : [x, peer, dynamic_shape] data_type : x + traits : paddle::dialect::ForwardOnlyTrait - op : p_send_array args : (Tensor x, int ring_id = 0, int peer = 0, bool dynamic_shape = false) @@ -658,6 +675,7 @@ func : p_send_array param : [x, peer, dynamic_shape] data_type : x + traits : paddle::dialect::ForwardOnlyTrait - op : pool2d args : (Tensor x, IntArray kernel_size, int[] strides = {1,1}, int[] paddings = {0,0}, bool ceil_mode = false, bool exclusive = true, str data_format = "NCHW", str pooling_type = "", bool global_pooling = false, bool adaptive = false, str padding_algorithm = "EXPLICIT", bool use_cudnn = false) @@ -701,6 +719,7 @@ func: QuantLinearInferMeta kernel: func: quant_linear + traits : paddle::dialect::ForwardOnlyTrait - op : randint args : (int low, int high, IntArray shape = {}, DataType dtype = DataType::INT64, int seed = 0) @@ -724,6 +743,7 @@ func : randperm param : [n, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : reduce args : (Tensor x, int ring_id = 0, int root_id = 0, int reduce_type = 0) @@ -734,6 +754,7 @@ kernel : func : reduce param: [x, root_id, reduce_type] + traits : paddle::dialect::ForwardOnlyTrait - op : remainder args : (Tensor x, Tensor y, int axis = -1) @@ -799,6 +820,7 @@ kernel: func : shadow_output param : [x] + traits : paddle::dialect::ForwardOnlyTrait - op : share_buffer args : (Tensor[] x, bool[] share_dims_and_dtype={}) @@ -807,6 +829,7 @@ func : ShareBufferInferMeta kernel : func : share_buffer + traits : paddle::dialect::ForwardOnlyTrait - op : softmax args : (Tensor x, int axis = -1) @@ -828,6 +851,7 @@ func: sparse_momentum data_type: param optional: master_param, master_param_out + traits : paddle::dialect::ForwardOnlyTrait - op : squeeze args : (Tensor x, IntArray axis={}) @@ -894,6 +918,7 @@ func: TransferLayoutInferMeta kernel: func: transfer_layout + traits : paddle::dialect::ForwardOnlyTrait - op : tril_indices args : (int rows = 0, int cols = 0, int offset = 0, DataType dtype = DataType::INT64) @@ -905,6 +930,7 @@ func : tril_indices param : [rows, cols, offset, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : tril_triu args : (Tensor x, int diagonal = 0, bool lower = false) @@ -925,6 +951,7 @@ func : triu_indices param : 
[row, col, offset, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : truncated_gaussian_random args : (int[] shape, float mean = .0f, float std = 1.0f, int seed = 0, float a = -2.0f, float b = 2.0f, DataType dtype=DataType::FLOAT32) @@ -936,6 +963,7 @@ func : truncated_gaussian_random param : [shape, mean, std, seed, a, b, dtype] data_type : dtype + traits : paddle::dialect::ForwardOnlyTrait - op : uniform args : (IntArray shape = {}, DataType dtype = DataType::FLOAT32, Scalar min = -1.0f, Scalar max = 1.0f, int seed = 0, int diag_num = 0, int diag_step = 0, float diag_val = 1.0f) @@ -997,3 +1025,4 @@ kernel: func: multiclass_nms data_type: scores + traits : paddle::dialect::ForwardOnlyTrait diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 4e75631cd508b..fe6aa5fc618df 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -38,6 +38,7 @@ func : accuracy_check data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : acos args : (Tensor x) @@ -73,6 +74,7 @@ data_type : param optional : master_param, master_param_out inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out) + traits : paddle::dialect::ForwardOnlyTrait - op : adagrad_ args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon = 1.0e-6f, bool multi_precision = false) @@ -85,7 +87,7 @@ data_type : param optional : master_param, master_param_out inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : adam_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, bool lazy_mode = false, int64_t min_row_size_to_use_multithread = 1000, bool multi_precision = false, bool use_global_beta_pow = false) @@ -99,7 +101,7 @@ data_type : param optional : master_param, skip_update, master_param_out inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : adamax_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, Tensor master_param, float beta1 = 0.9f, float beta2 = 0.999f, float epsilon = 1.0e-8f, bool multi_precision = false) @@ -111,7 +113,7 @@ data_type : param optional : master_param, master_param_out inplace : (param -> param_out), (moment -> moment_out), (inf_norm -> inf_norm_out), (master_param ->master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : adamw_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, float lr_ratio = 1.0f, float coeff = 0.01f, bool with_decay = false, bool lazy_mode = false, int64_t min_row_size_to_use_multithread = 1000, bool multi_precision = false, bool use_global_beta_pow = false) @@ -124,7 
+126,7 @@ data_type : param optional : master_param, skip_update, master_param_out inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : add_position_encoding args: (Tensor x, float alpha = 1.0f, float beta = 1.0f) @@ -191,6 +193,7 @@ kernel : func : all_gather param: [x, nranks] + traits : paddle::dialect::ForwardOnlyTrait - op : all_to_all args : (Tensor x, int ring_id = 0) @@ -263,6 +266,7 @@ func : apply_per_channel_scale data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : argmax args : (Tensor x, Scalar(int64_t) axis, bool keepdims = false, bool flatten = false, DataType dtype = DataType::INT64) @@ -341,7 +345,7 @@ support_trans_dtype : learning_rate, n optional : master_param, master_param_out inplace : (param -> param_out), (d -> d_out), (y -> y_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : asin args : (Tensor x) @@ -385,6 +389,7 @@ func : AssignPosInferMeta kernel : func : assign_pos + traits : paddle::dialect::ForwardOnlyTrait - op : assign_value_ args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) @@ -447,6 +452,7 @@ data_type: x optional: h0, attention_bias, attention_scalar, attention_scalar_bias intermediate: attentioned_x, attention_fc_out, lstm_x, lstm_out + traits : paddle::dialect::ForwardOnlyTrait - op : auc args : (Tensor x, Tensor label, Tensor stat_pos, Tensor stat_neg, Tensor ins_tag_weight, str curve = "ROC", int num_thresholds = (2 << 12) - 1, int slide_steps = 1) @@ -458,6 +464,7 @@ data_type : x optional : ins_tag_weight interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : average_accumulates_ args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window = 0, int64_t max_average_window = INT64_MAX, int64_t min_average_window = 10000L) @@ -468,6 +475,7 @@ func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense} data_type : param inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) + traits : paddle::dialect::ForwardOnlyTrait - op : batch_fc args : (Tensor input, Tensor w, Tensor bias) @@ -502,6 +510,7 @@ func: beam_search data_type: pre_ids optional: ids, parent_idx + traits : paddle::dialect::ForwardOnlyTrait - op : bernoulli args : (Tensor x) @@ -571,6 +580,7 @@ kernel : func : binomial interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : bipartite_match args: (Tensor dist_mat, str match_type = "bipartite", float dist_threshold = 0.5) @@ -581,6 +591,7 @@ func: bipartite_match data_type: dist_mat interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : bitwise_and args : (Tensor x, Tensor y) @@ -674,6 +685,7 @@ kernel: func: box_clip interfaces: paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait 
- op : box_coder args : (Tensor prior_box, Tensor prior_box_var, Tensor target_box, str code_type = "encode_center_size", bool box_normalized = true, int axis = 0, float[] variance = {}) @@ -684,6 +696,7 @@ func : box_coder optional : prior_box_var interfaces: paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : broadcast_tensors args: (Tensor[] input) @@ -704,6 +717,7 @@ param: [x, nranks] kernel : func : c_allgather + traits : paddle::dialect::ForwardOnlyTrait - op : c_allreduce_max args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) @@ -714,6 +728,7 @@ kernel : func : c_allreduce_max inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_allreduce_min args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) @@ -724,6 +739,7 @@ kernel : func : c_allreduce_min inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_allreduce_prod args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) @@ -734,6 +750,7 @@ kernel : func : c_allreduce_prod inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_allreduce_sum args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) @@ -744,6 +761,7 @@ kernel : func : c_allreduce_sum inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_broadcast args : (Tensor x, int ring_id=0, int root=0, bool use_calc_stream=false) @@ -754,6 +772,7 @@ kernel : func : c_broadcast inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_concat args : (Tensor x, int rank, int nranks, int ring_id, bool use_calc_stream, bool use_model_parallel) @@ -763,6 +782,7 @@ param : [x, nranks] kernel : func : c_concat + traits : paddle::dialect::ForwardOnlyTrait - op : c_identity args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) @@ -772,6 +792,7 @@ kernel : func : c_identity inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_reduce_sum args : (Tensor x, int ring_id, int root_id, bool use_calc_stream) @@ -782,6 +803,7 @@ kernel : func : c_reduce_sum inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_scatter args : (Tensor x, int ring_id = 0, int root = 0, int nranks = 0, bool use_calc_stream = false) @@ -791,6 +813,7 @@ param : [x, ring_id, root, nranks] kernel : func : c_scatter + traits : paddle::dialect::ForwardOnlyTrait - op : c_sync_calc_stream args : (Tensor x) @@ -801,6 +824,7 @@ kernel : func : c_sync_calc_stream inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : c_sync_comm_stream args : (Tensor x, int ring_id) @@ -811,6 +835,7 @@ kernel : func : c_sync_comm_stream inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : calc_reduced_attn_scores args : (Tensor q, Tensor k, Tensor softmax_lse) @@ -821,6 +846,7 @@ kernel : func : calc_reduced_attn_scores data_type : q + traits : paddle::dialect::ForwardOnlyTrait - op : cast args : (Tensor x, DataType dtype) @@ -881,6 +907,7 @@ data_type : x inplace : (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : check_numerics args : (Tensor tensor, str op_type = "", str var_name = "", int check_nan_inf_level = 0, int stack_height_limit = -1, str output_dir = "") @@ -890,6 +917,7 @@ kernel : func : check_numerics interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : cholesky args : 
(Tensor x, bool upper=false) @@ -918,7 +946,7 @@ kernel : func : class_center_sample data_type : label - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait interfaces : paddle::dialect::InferSymbolicShapeInterface - op : clip @@ -943,6 +971,7 @@ func : clip_by_norm {dense -> dense} clip_by_norm_sr {selected_rows -> selected_rows} interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : coalesce_tensor args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {}) @@ -953,6 +982,7 @@ func : coalesce_tensor data_type : dtype interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : collect_fpn_proposals args: (Tensor[] multi_level_rois, Tensor[] multi_level_scores, Tensor[] multi_level_rois_num, @@ -964,6 +994,7 @@ func: collect_fpn_proposals data_type: multi_level_rois optional: multi_level_rois_num, rois_num + traits : paddle::dialect::ForwardOnlyTrait - op : complex args : (Tensor real, Tensor imag) @@ -1031,6 +1062,7 @@ kernel : func : conv2d_transpose_bias data_type : x + traits : paddle::dialect::ForwardOnlyTrait # interfaces : paddle::dialect::InferSymbolicShapeInterface - op : conv3d @@ -1060,6 +1092,7 @@ output : Tensor(out) invoke : copy_to_impl(x, place, blocking) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : copysign args : (Tensor x, Tensor y) @@ -1116,6 +1149,7 @@ data_type: emission optional: label, length interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : crop args : (Tensor x, IntArray shape = {}, IntArray offsets = {}) @@ -1164,6 +1198,7 @@ data_type: input optional: input_length, output_length interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : cudnn_lstm args: (Tensor x, Tensor init_h, Tensor init_c, Tensor w, Tensor[] weight_list, Tensor sequence_length, float dropout_prob = 0.0, bool is_bidirec = false, int hidden_size = 100, int num_layers = 1, bool is_test = false, int seed = 0) @@ -1248,6 +1283,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : decayed_adagrad args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f) @@ -1257,6 +1293,7 @@ kernel : func : decayed_adagrad data_type : param + traits : paddle::dialect::ForwardOnlyTrait - op : decode_jpeg args : (Tensor x, str mode, Place place) @@ -1269,6 +1306,7 @@ param : [x, mode] backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : deformable_conv args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) @@ -1291,6 +1329,7 @@ kernel: func: depend interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : depthwise_conv2d args : (Tensor input, Tensor filter, int[] strides={1, 1}, int[] paddings={0, 0}, str padding_algorithm="EXPLICIT", int groups=1, int[] dilations={1, 1}, str data_format="NCHW") @@ -1323,6 
+1362,7 @@ func : dequantize_abs_max data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : dequantize_log args: (Tensor x, Tensor dict) @@ -1333,6 +1373,7 @@ func: dequantize_log data_type: x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : det args : (Tensor x) @@ -1356,6 +1397,7 @@ data_type: detect_res optional: has_state, pos_count, true_pos, false_pos interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : dgc args : (Tensor u, Tensor v, Tensor grad, Tensor param, Tensor current_step, Tensor nranks, float m=0.9, bool use_nesterov=true, float[] sparsity={}, float rampup_begin_step=0.0, float rampup_step=0.0, float regular_coeff=0.0, int regular_type=0) @@ -1369,6 +1411,7 @@ optional: param data_transform : skip_transform : current_step, nranks + traits : paddle::dialect::ForwardOnlyTrait - op : dgc_clip_by_norm args: (Tensor x, Tensor current_step, float max_norm, float rampup_begin_step = -1.0) @@ -1381,6 +1424,7 @@ dgc_clip_by_norm_sr {selected_rows, dense -> selected_rows} data_transform : skip_transform : current_step + traits : paddle::dialect::ForwardOnlyTrait - op : dgc_momentum args: (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor @@ -1398,6 +1442,7 @@ optional : master_param, master_param_out data_transform : skip_transform : current_step_tensor, nranks_tensor + traits : paddle::dialect::ForwardOnlyTrait - op : diag args : (Tensor x, int offset = 0, float padding_value = 0.0) @@ -1448,6 +1493,7 @@ kernel: func: dirichlet interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : disable_check_model_nan_inf args: (Tensor x, int flag = 0) @@ -1490,6 +1536,7 @@ kernel: func: dpsgd data_type: param + traits : paddle::dialect::ForwardOnlyTrait - op : dropout args : (Tensor x, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) @@ -1515,6 +1562,7 @@ data_type : DataType::FLOAT32 optional : hypslength, refslength interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : eig args: (Tensor x) @@ -1544,6 +1592,7 @@ kernel : func : eigvals interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : eigvalsh args : (Tensor x, str uplo = "L", bool is_test = false) @@ -1580,6 +1629,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : empty_like args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {}) @@ -1593,6 +1643,7 @@ data_type : dtype > x backend : place > x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : enable_check_model_nan_inf args: (Tensor x, int flag = 1) @@ -1708,6 +1759,7 @@ param : [num_rows, num_columns, dtype] data_type : dtype backend : place + traits : paddle::dialect::ForwardOnlyTrait - op : fake_channel_wise_dequantize_max_abs args : (Tensor x, Tensor[] scales, int[] quant_bits = {8}, int quant_axis = 0, int x_num_col_dims = 1) @@ -1718,6 +1770,7 @@ func : fake_channel_wise_dequantize_max_abs data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fake_channel_wise_quantize_abs_max args : (Tensor x, int bit_length = 8, int round_type = 1, int 
quant_axis = 0, bool is_test = false) @@ -1728,6 +1781,7 @@ func : fake_channel_wise_quantize_abs_max data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fake_channel_wise_quantize_dequantize_abs_max args : (Tensor x, int bit_length = 8, int round_type = 1, int quant_axis = 0) @@ -1749,6 +1803,7 @@ func : fake_dequantize_max_abs data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fake_quantize_abs_max args : (Tensor x, int bit_length = 8, int round_type = 1) @@ -1759,6 +1814,7 @@ func : fake_quantize_abs_max data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fake_quantize_dequantize_abs_max args : (Tensor x, int bit_length = 8, int round_type = 1) @@ -1795,6 +1851,7 @@ optional : in_accum, in_state, out_state, out_accum inplace: (in_scale -> out_scale) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fake_quantize_range_abs_max args : (Tensor x, Tensor in_scale, Tensor iter, int window_size = 10000, int bit_length = 8, bool is_test = false, int round_type = 1) @@ -1807,6 +1864,7 @@ optional : iter, out_scales inplace: (in_scale -> out_scale) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fft_c2c args : (Tensor x, int64_t[] axes, str normalization, bool forward) @@ -2055,6 +2113,7 @@ func: ftrl {dense, dense, dense, dense, dense -> dense, dense, dense} ftrl_sr {dense, dense, dense, selected_rows, dense -> dense, dense, dense} data_type: param + traits : paddle::dialect::ForwardOnlyTrait - op : full args : (IntArray shape, Scalar(double) value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) @@ -2068,6 +2127,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : full_ args : (Tensor output, IntArray shape, Scalar(double) value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) @@ -2081,6 +2141,7 @@ param : [shape, value, dtype] data_type : dtype backend : place + traits : paddle::dialect::ForwardOnlyTrait - op : full_batch_size_like args : (Tensor input, int[] shape, DataType dtype, Scalar(double) value, int input_dim_idx, int output_dim_idx, Place place=CPUPlace()) @@ -2094,6 +2155,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : full_int_array args : (int64_t[] value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) @@ -2107,6 +2169,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : full_like args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {}) @@ -2173,6 +2236,7 @@ func : fused_multi_transformer data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : fused_softmax_mask args : (Tensor x, Tensor mask) @@ -2250,6 +2314,7 @@ func : gather_tree data_type : ids interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : gaussian args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) @@ -2300,6 +2365,7 @@ data_type : anchors optional : rpn_rois_num interfaces : 
paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : graph_khop_sampler args : (Tensor row, Tensor colptr, Tensor x, Tensor eids, int[] sample_sizes, bool return_eids) @@ -2311,6 +2377,7 @@ data_type : row optional : eids interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : graph_sample_neighbors args : (Tensor row, Tensor colptr, Tensor x, Tensor eids, Tensor perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer) @@ -2322,6 +2389,7 @@ data_type : row optional : eids, perm_buffer interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : grid_sample args : (Tensor x, Tensor grid, str mode = "bilinear", str padding_mode = "zeros", bool align_corners = true) @@ -2554,6 +2622,7 @@ func : increment inplace : (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : index_add args : (Tensor x, Tensor index, Tensor add_value, int axis = 0) @@ -2759,7 +2828,7 @@ data_type : param optional : master_param, skip_update, beta1_pow_out, beta2_pow_out, master_param_outs inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : layer_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon = 1e-5, int begin_norm_axis = 1) @@ -2817,6 +2886,7 @@ kernel : func : limit_by_capacity data_type : expert_count + traits : paddle::dialect::ForwardOnlyTrait - op : linear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_format="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) @@ -2855,6 +2925,7 @@ func : llm_int8_linear data_type : x optional: bias + traits : paddle::dialect::ForwardOnlyTrait - op : log args : (Tensor x) @@ -3020,6 +3091,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : logsumexp args : (Tensor x, int[] axis={0}, bool keepdim=false, bool reduce_all=false) @@ -3040,6 +3112,7 @@ kernel: func: lookup_table_dequant data_type: w + traits : paddle::dialect::ForwardOnlyTrait - op : lp_pool2d args : (Tensor x, IntArray kernel_size, int[] strides = {1,1}, int[] paddings = {0,0}, bool ceil_mode = false, bool exclusive = true, str data_format = "NCHW", str pooling_type = "", bool global_pooling = false, bool adaptive = false, str padding_algorithm = "EXPLICIT", float norm_type = 0.0f) @@ -3076,6 +3149,7 @@ func : lstsq data_type : x optional : residuals + traits : paddle::dialect::ForwardOnlyTrait - op : lu args : (Tensor x, bool pivot = true) @@ -3122,6 +3196,7 @@ optional : bias, src_mask, cum_offsets, sequence_lengths, rotary_tensor, beam_cache_offset, qkv_out_scale, out_shift, out_smooth inplace : (cache_kv -> cache_kv_out), (beam_cache_offset -> beam_cache_offset_out) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : masked_select args : (Tensor x, Tensor mask) @@ -3153,6 +3228,7 @@ kernel : func : matrix_nms interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : matrix_power args : (Tensor x, int n) @@ 
-3173,6 +3249,7 @@ kernel : func : matrix_rank interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : matrix_rank_atol_rtol args : (Tensor x, Tensor atol, Tensor rtol, bool hermitian=false) @@ -3182,6 +3259,7 @@ kernel : func : matrix_rank_atol_rtol optional : rtol + traits : paddle::dialect::ForwardOnlyTrait - op : matrix_rank_tol args : (Tensor x, Tensor atol_tensor, bool use_default_tol=true, bool hermitian=false) @@ -3190,6 +3268,7 @@ func : MatrixRankTolInferMeta kernel : func : matrix_rank_tol + traits : paddle::dialect::ForwardOnlyTrait - op : max args : (Tensor x, IntArray axis={}, bool keepdim=false) @@ -3261,6 +3340,7 @@ param : [x] kernel : func : memcpy_d2h + traits : paddle::dialect::ForwardOnlyTrait - op : memcpy_h2d args : (Tensor x, int dst_place_type) @@ -3270,6 +3350,7 @@ param : [x] kernel : func : memcpy_h2d + traits : paddle::dialect::ForwardOnlyTrait - op : memory_efficient_attention args : (Tensor query, Tensor key, Tensor value, Tensor bias, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor causal_diagonal, Tensor seqlen_k, Scalar max_seqlen_q, Scalar max_seqlen_k, bool causal, double dropout_p, float scale, bool is_test) @@ -3291,6 +3372,7 @@ kernel : func : merge_selected_rows {selected_rows -> selected_rows} interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : merged_adam_ args : (Tensor[] param, Tensor[] grad, Tensor[] learning_rate, Tensor[] moment1, Tensor[] moment2, Tensor[] beta1_pow, Tensor[] beta2_pow, Tensor[] master_param, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, bool multi_precision = false, bool use_global_beta_pow = false) @@ -3302,7 +3384,7 @@ data_type : param optional: master_param, master_param_out inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : merged_momentum_ args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f) @@ -3314,7 +3396,7 @@ data_type : param optional: master_param, master_param_out inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : meshgrid args : (Tensor[] inputs) @@ -3359,7 +3441,7 @@ data_type : param optional : master_param, master_param_out inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : multi_dot args : (Tensor[] x) @@ -3381,6 +3463,7 @@ data_type : scores optional : rois_num, nms_rois_num interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : multinomial args : (Tensor x, Scalar(int) num_samples = 1, bool replacement = false) - op : multiplex args : (Tensor[] inputs, Tensor index) @@ -3426,7 +3510,7 @@ data_type : 
param optional : master_param, master_param_out inplace : (param -> param_out), (momentum_decay_pow -> momentum_decay_pow_out), (beta2_pow -> beta2_pow_out), (mu_product -> mu_product_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (master_param->master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : nanmedian args : (Tensor x, IntArray axis = {}, bool keepdim = true, str mode="avg") @@ -3484,6 +3568,7 @@ kernel : func : nms data_type : x + traits : paddle::dialect::ForwardOnlyTrait - op : nonzero args : (Tensor condition) @@ -3515,6 +3600,7 @@ kernel : func : npu_identity interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : numel args : (Tensor x) @@ -3546,11 +3632,13 @@ args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output : Tensor(out) invoke : full(shape, 1, dtype, place) + traits : paddle::dialect::ForwardOnlyTrait - op : ones_like args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={}) output : Tensor(out) invoke : full_like(x, 1, dtype, place) + traits : paddle::dialect::ForwardOnlyTrait - op : overlap_add args: (Tensor x, int hop_length, int axis=-1) @@ -3736,6 +3824,7 @@ kernel : func : prune_gate_by_capacity data_type : gate_idx + traits : paddle::dialect::ForwardOnlyTrait - op : psroi_pool args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height=1, int pooled_width=1, int output_channels=1, float spatial_scale=1.0) @@ -3798,7 +3887,7 @@ data_type : param optional : master_param, master_param_out inplace : (param -> param_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (rho -> rho_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (master_param->master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : randint args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) @@ -3823,7 +3912,7 @@ func : random_routing data_type : prob inplace : (topk_idx -> out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : randperm args : (int n, DataType dtype, Place place={}) @@ -3836,7 +3925,7 @@ param : [n, dtype] data_type : dtype backend : place - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait interfaces : paddle::dialect::InferSymbolicShapeInterface - op : rank_attention @@ -3863,6 +3952,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : real args : (Tensor x) @@ -3906,6 +3996,7 @@ kernel : func : reduce_scatter param: [x, nranks] + traits : paddle::dialect::ForwardOnlyTrait - op : reindex_graph args : (Tensor x, Tensor neighbors, Tensor count, Tensor hashtable_value, Tensor hashtable_index) @@ -3916,6 +4007,7 @@ func : graph_reindex data_type : x optional : hashtable_value, hashtable_index + traits : paddle::dialect::ForwardOnlyTrait # interfaces : paddle::dialect::InferSymbolicShapeInterface - op : relu @@ -4025,7 +4117,7 @@ data_type : param optional : mean_grad, master_param, master_param_outs inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out), (master_param->master_param_outs) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : rnn args: (Tensor x, 
Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor dropout_state_in, float dropout_prob=0.0, bool is_bidirec=false, int input_size=10, int hidden_size=100, int num_layers=1, str mode="RNN_TANH", int seed=0, bool is_test=false) @@ -4103,7 +4195,7 @@ support_trans_dtype : learning_rate optional : master_param, master_param_out inplace : (param -> param_out), (prev -> prev_out), (learning_rate -> learning_rate_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : rrelu args : (Tensor x, float lower=1.0f/8, float upper=1.0f/3, bool is_test=false) @@ -4257,6 +4349,7 @@ func: sequence_mask_scalar data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : sequence_pool args: (Tensor x, bool is_test=false, str pooltype = "AVERAGE", float pad_value = 0.0) @@ -4296,7 +4389,7 @@ support_trans_dtype : learning_rate optional : master_param, master_param_out inplace : (param -> param_out), (master_param -> master_param_out) - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait - op : shape args : (Tensor input) @@ -4319,6 +4412,7 @@ kernel : func : shard_index interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : share_data args: (Tensor x) @@ -4329,6 +4423,7 @@ kernel: func: share_data {dense -> dense} share_data_sr {selected_rows -> selected_rows} + traits : paddle::dialect::ForwardOnlyTrait - op : shuffle_batch args : (Tensor x, Tensor seed, int startup_seed=0) @@ -4601,6 +4696,7 @@ kernel : func : standard_gamma interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : stanh args : (Tensor x, float scale_a=0.67f, float scale_b=1.7159f) @@ -4701,6 +4797,7 @@ kernel : func : sync_calc_stream inplace : (x -> out) + traits : paddle::dialect::ForwardOnlyTrait - op : take_along_axis args : (Tensor arr, Tensor indices, int axis) @@ -4755,6 +4852,7 @@ func: tdm_child data_type: x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : tdm_sampler args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2) @@ -4766,6 +4864,7 @@ data_type : x optional : labels interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : temporal_shift args : (Tensor x, int seg_num, float shift_ratio = 0.25f, str data_format = "NCHW") @@ -4812,6 +4911,7 @@ data_type : x optional : threshold, topp_seed, topk_scores, topk_ids interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : topk args : (Tensor x, Scalar(int) k = 1, int axis = -1, bool largest = true, bool sorted = true) @@ -4890,6 +4990,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : trilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_format="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) @@ -4929,6 +5030,7 @@ data_type : dtype backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : trunc 
args : (Tensor input) @@ -4953,7 +5055,7 @@ param : [shape, mean, std, seed, a, b, dtype] backend : place data_type : dtype - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait interfaces : paddle::dialect::InferSymbolicShapeInterface - op : unbind @@ -5015,7 +5117,7 @@ uniform_random_batch_size_like_sr {selected_rows -> selected_rows} data_type: dtype no_need_buffer: input - traits : pir::SideEffectTrait + traits : pir::SideEffectTrait, paddle::dialect::ForwardOnlyTrait # interfaces : paddle::dialect::InferSymbolicShapeInterface - op : unique_consecutive @@ -5028,6 +5130,7 @@ data_type : x optional : index, counts interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : unpool args: (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, IntArray output_size, str data_format) @@ -5089,6 +5192,7 @@ skip_transform : found_infinite inplace : (x -> out), (prev_loss_scaling -> loss_scaling), (in_good_steps -> out_good_steps), (in_bad_steps -> out_bad_steps) interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : view_dtype args : (Tensor input, DataType dtype) @@ -5124,6 +5228,7 @@ func : viterbi_decode data_type : potentials interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : warpctc args : (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank = 0, bool norm_by_times = false) @@ -5158,6 +5263,7 @@ kernel : func : weight_dequantize data_type : out_dtype + traits : paddle::dialect::ForwardOnlyTrait # interfaces : paddle::dialect::InferSymbolicShapeInterface - op : weight_only_linear @@ -5181,6 +5287,7 @@ func : weight_quantize data_type : x backend : x + traits : paddle::dialect::ForwardOnlyTrait # interfaces : paddle::dialect::InferSymbolicShapeInterface - op : weighted_sample_neighbors @@ -5192,6 +5299,7 @@ func : weighted_sample_neighbors optional : eids interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : where args : (Tensor condition, Tensor x, Tensor y) @@ -5214,6 +5322,7 @@ func : yolo_box data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : yolo_box_head args : (Tensor x, int[] anchors, int class_num) @@ -5224,6 +5333,7 @@ func : yolo_box_head data_type : x interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : yolo_box_post args : (Tensor boxes0, Tensor boxes1, Tensor boxes2, Tensor image_shape, Tensor image_scale, int[] anchors0, int[] anchors1, int[] anchors2, int class_num, float conf_thresh, int downsample_ratio0, int downsample_ratio1, int downsample_ratio2, bool clip_bbox, float scale_x_y, float nms_threshold) @@ -5234,6 +5344,7 @@ func : yolo_box_post data_type : boxes0 interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op : yolo_loss args : (Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors={}, int[] anchor_mask={}, int class_num =1 , float ignore_thresh=0.7, int downsample_ratio=32, bool use_label_smooth=true, float scale_x_y=1.0) @@ -5270,6 +5381,7 @@ data_type: DataType::FLOAT32 optional: seq_length interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait - op: number_count args: (Tensor numbers, int upper_range) @@ -5280,3 
+5392,4 @@ func: number_count data_type: numbers interfaces : paddle::dialect::InferSymbolicShapeInterface + traits : paddle::dialect::ForwardOnlyTrait From 3dc7ce0514b5efeb1ca805e28c84bcd2a07b95e3 Mon Sep 17 00:00:00 2001 From: co63oc Date: Sat, 12 Oct 2024 16:08:55 +0800 Subject: [PATCH 091/135] Move profiler test to test directory [fluid_ops] (#68634) --- paddle/fluid/platform/profiler/CMakeLists.txt | 16 ---------------- test/cpp/fluid/platform/profiler/CMakeLists.txt | 17 ++++++++++++++++- .../profiler/dump/test_serialization_logger.cc | 0 .../fluid/platform/profiler/profiler_test.cc | 0 .../fluid/platform/profiler/test_event_node.cc | 0 .../fluid/platform/profiler/test_extra_info.cc | 0 6 files changed, 16 insertions(+), 17 deletions(-) rename {paddle => test/cpp}/fluid/platform/profiler/dump/test_serialization_logger.cc (100%) rename {paddle => test/cpp}/fluid/platform/profiler/profiler_test.cc (100%) rename {paddle => test/cpp}/fluid/platform/profiler/test_event_node.cc (100%) rename {paddle => test/cpp}/fluid/platform/profiler/test_extra_info.cc (100%) diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index fcc96af30289a..c443213aa0554 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -36,19 +36,3 @@ cc_library( glog event_bind custom_tracer) -cc_test( - test_event_node - SRCS test_event_node.cc - DEPS event_node profiler_logger) -cc_test( - test_extra_info - SRCS test_extra_info.cc - DEPS phi glog common) -cc_test( - test_serialization_logger - SRCS dump/test_serialization_logger.cc - DEPS event_bind) -cc_test( - new_profiler_test - SRCS profiler_test.cc - DEPS new_profiler) diff --git a/test/cpp/fluid/platform/profiler/CMakeLists.txt b/test/cpp/fluid/platform/profiler/CMakeLists.txt index 0a95e9a292a4d..19e4c3b892db5 100644 --- a/test/cpp/fluid/platform/profiler/CMakeLists.txt +++ b/test/cpp/fluid/platform/profiler/CMakeLists.txt @@ -1 +1,16 @@ -# Note(Liyulingyue): create an empty cmake file to avoid conflict +cc_test( + test_event_node + SRCS test_event_node.cc + DEPS event_node profiler_logger) +cc_test( + test_extra_info + SRCS test_extra_info.cc + DEPS phi glog common) +cc_test( + test_serialization_logger + SRCS dump/test_serialization_logger.cc + DEPS event_bind) +cc_test( + new_profiler_test + SRCS profiler_test.cc + DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/test/cpp/fluid/platform/profiler/dump/test_serialization_logger.cc similarity index 100% rename from paddle/fluid/platform/profiler/dump/test_serialization_logger.cc rename to test/cpp/fluid/platform/profiler/dump/test_serialization_logger.cc diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/test/cpp/fluid/platform/profiler/profiler_test.cc similarity index 100% rename from paddle/fluid/platform/profiler/profiler_test.cc rename to test/cpp/fluid/platform/profiler/profiler_test.cc diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/test/cpp/fluid/platform/profiler/test_event_node.cc similarity index 100% rename from paddle/fluid/platform/profiler/test_event_node.cc rename to test/cpp/fluid/platform/profiler/test_event_node.cc diff --git a/paddle/fluid/platform/profiler/test_extra_info.cc b/test/cpp/fluid/platform/profiler/test_extra_info.cc similarity index 100% rename from paddle/fluid/platform/profiler/test_extra_info.cc rename to test/cpp/fluid/platform/profiler/test_extra_info.cc From 
57587d4d476f29376ee076692a5267b07ab88efa Mon Sep 17 00:00:00 2001 From: Chang Lu <55493212+AndSonder@users.noreply.github.com> Date: Sat, 12 Oct 2024 16:21:44 +0800 Subject: [PATCH 092/135] [AutoParallel][PIR] Fix 1F1B and VPP hang (#68141) * add enable send recv * Update pir.cc * Update pass_utils.py * fix hang * remove useless codes * fix * remove useless codes * remove useless codes * fix vpp hang --- .../pir/transforms/pd_op_to_kernel_pass.cc | 51 +++++++++++++++++++ .../auto_parallel/static/engine.py | 1 - .../auto_parallel/static/pir_pass.py | 2 +- .../reshard_funcs/same_status_reshard_func.py | 14 +++-- .../paddle/distributed/passes/pass_utils.py | 30 ++++++++++- 5 files changed, 90 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 95635f2543283..24e1cd7aa6c78 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -3058,6 +3058,57 @@ void RemoveRedundantMemcpyAfterShadowFeed(pir::Block* block, VLOG(6) << *it; } } + + pir::Value shadow_source = it->operand_source(0); + auto val_src_place = + shadow_source.type() + .dyn_cast() + .place(); + + if (shadow_value.use_count() >= 1 || val_src_place == phi::CPUPlace()) { + bool all_use_is_scalar = true; + for (auto use_it = shadow_value.use_begin(); + use_it != shadow_value.use_end(); + ++use_it) { + auto use_op = use_it->owner(); + + if (!use_op->isa()) { + all_use_is_scalar = false; + break; + } + + auto op_info = ctx->GetRegisteredOpInfo( + use_op->dyn_cast().op_name()); + + if (!op_info) { + all_use_is_scalar = false; + break; + } + + auto* op_info_concept = + op_info.GetInterfaceImpl(); + auto [input_infos, _1, _2, _3, _4] = + op_info_concept->get_op_info_(op_info.name()); + + uint32_t val_index = 0; + for (uint32_t index = 0; index < use_op->num_operands(); index++) { + if (use_op->operand_source(index) == shadow_value) { + val_index = index; + break; + } + } + + if (!input_infos[val_index].is_mutable_attribute) { + all_use_is_scalar = false; + break; + } + } + if (all_use_is_scalar) { + // set dst_place_type for shadow_feed, 0 for cpu_place + VLOG(6) << "Reset shadow_feed dst_place_type to 0 for scalar use"; + it->set_attribute("dst_place_type", pir::Int32Attribute::get(ctx, 0)); + } + } } } } diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index d9998e7dee00c..dffd5266b052c 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1294,7 +1294,6 @@ def _init_comm(self): all_process_groups = get_all_process_groups() for process_group in all_process_groups: process_group.instantiate() - pass return # Traverse different rank programs and traverse each op of them, diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index be3703bc69ee6..9d9f91e0fe67d 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -162,7 +162,7 @@ def apply_partition_pass(program): assign_out.get_defining_op().dist_attr = ( copy_op_attr_with_new_member( assign_out.get_defining_op().dist_attr, - new_chunk_id=prev_op.dist_attr.chunk_id, + new_chunk_id=op.dist_attr.chunk_id, ) ) diff --git 
a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py index 32aa00bd84f8e..9705bc1887987 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py @@ -45,20 +45,24 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): all_process_ids = sorted(all_process_ids) cur_global_rank = paddle.distributed.get_rank() - comm_group = new_process_group(all_process_ids) + + for src, dst in zip(src_mesh.process_ids, dst_mesh.process_ids): + if src != dst: + new_process_group([src, dst], group_type="p2p") + new_process_group([dst, src], group_type="p2p") is_send = True for src, dst in zip(src_mesh.process_ids, dst_mesh.process_ids): if src == cur_global_rank: - dst_local_rank = all_process_ids.index(dst) chunk_id = -1 if src_value.get_defining_op().dist_attr: chunk_id = src_value.get_defining_op().dist_attr.chunk_id + comm_group = new_process_group([src, dst], group_type="p2p") paddle._C_ops.send_v2( src_value, comm_group.id, - dst_local_rank, + comm_group.ranks.index(dst), True, False, ) @@ -81,15 +85,15 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): if var.dist_attr().process_mesh == dst_mesh: chunk_id = find_var_used_op_chunk_id(var) - src_local_rank = all_process_ids.index(src) assert ( -1 not in dst_type.shape ), "dynamic shape is not supported by pir auto parallel yet." + comm_group = new_process_group([src, dst], group_type="p2p") recv_value = paddle._C_ops.recv_v2( dst_type._local_shape, dst_type.dtype, - src_local_rank, + comm_group.ranks.index(src), comm_group.id, True, False, diff --git a/python/paddle/distributed/passes/pass_utils.py b/python/paddle/distributed/passes/pass_utils.py index 98fd33855a7be..8561a6e19da58 100644 --- a/python/paddle/distributed/passes/pass_utils.py +++ b/python/paddle/distributed/passes/pass_utils.py @@ -466,6 +466,29 @@ def _create_program(src_block, dst_block, src_op, force_create=False): _create_var(src_block, dst_block, output_varname, force_create) + +def _pir_overlap_send_recv(program): + """ + This function replaces the function '_insert_sync_for_fthenb_1f1b'. + The final goals of this function are as follows: + 1. no need to insert the 'c_sync_calc_stream' and 'c_sync_comm_stream' operators + 2. the 'send_v2' operator uses 'dist_attr.execution_stream' to set its own stream. + 3. the 'recv_v2' operator uses 'dist_attr.execution_stream' to set its own stream. + """ + for block in program.blocks: + for op in block.ops: + if op.name() == "pd_op.send_v2": + op.set_bool_attr("dynamic_shape", False) + op.set_bool_attr("use_calc_stream", True) + ring_id = op.attrs()["ring_id"] + op.set_execution_stream(f"send_stream_{ring_id}") + op.set_scheduling_priority(0) + elif op.name() == "pd_op.recv_v2": + op.set_bool_attr("dynamic_shape", False) + op.set_bool_attr("use_calc_stream", True) + op.set_execution_stream("recv_stream") + op.set_scheduling_priority(0) + + def _insert_sync_for_fthenb_1f1b(program, dist_context=None): """ This implementation refers to lots of Paddle/python/paddle/base/optimizer.py.
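To make the stream wiring in _pir_overlap_send_recv concrete, the following is a minimal pure-Python sketch of the same walk. It is illustrative only and not part of the patch: plain dicts stand in for pir ops, and the ops list and dict keys are assumptions for the sketch; only the op names, the per-ring send-stream naming, and the shared recv stream come from the function above.

ops = [
    {"name": "pd_op.send_v2", "attrs": {"ring_id": 3}},
    {"name": "pd_op.matmul", "attrs": {}},
    {"name": "pd_op.recv_v2", "attrs": {"ring_id": 5}},
]
for op in ops:
    if op["name"] == "pd_op.send_v2":
        # each send ring gets its own stream, so sends overlap with compute
        op["execution_stream"] = f"send_stream_{op['attrs']['ring_id']}"
        op["scheduling_priority"] = 0
    elif op["name"] == "pd_op.recv_v2":
        # all recvs share one stream, which keeps their relative order
        op["execution_stream"] = "recv_stream"
        op["scheduling_priority"] = 0

assert ops[0]["execution_stream"] == "send_stream_3"
assert ops[2]["execution_stream"] == "recv_stream"

Tagging the communication ops with their own execution streams is what lets the pass skip the explicit synchronization operators that _insert_sync_for_fthenb_1f1b used to insert.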
@@ -805,6 +828,8 @@ def find_var_used_op_chunk_id(var): def _split_program_into_forward_backward_optimize( main_program, enable_send_recv_overlap=False ): + _pir_overlap_send_recv(main_program) + forward_complete_op_role(main_program) complete_ops = main_program.global_block().ops @@ -837,22 +862,25 @@ def _split_program_into_forward_backward_optimize( # if this op's output is used, create the persistable # var to be used in other programs. result_in_opt = opt_ops[op_idx].result(idx) + if result_in_opt.use_empty() is False: name = f"var_{op_idx}_{complete_ops[op_idx].name()}_{idx}" paddle.pir.set_insertion_point_after(bwd_ops[op_idx]) paddle._C_ops.set_persistable_value( bwd_ops[op_idx].result(idx), name ) - # bwd_ops[op_idx].result(idx).persistable = True + new_result_var_in_opt = opt_block.add_kwarg( name, result_in_opt.type() ) new_result_var_in_opt.persistable = ( result_in_opt.persistable ) + opt_ops[op_idx].result(idx).replace_all_uses_with( new_result_var_in_opt ) + opt_ops[op_idx].erase() else: # in backward program, only the forward ops should be removed From 88d4de6acb0cb117872e460812812e5a7e2c85ed Mon Sep 17 00:00:00 2001 From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Date: Sat, 12 Oct 2024 16:33:18 +0800 Subject: [PATCH 093/135] [Auto Parallel] fix enable_delay_scale_loss for static auto parallel && fix sharding degree (#68525) --- .../paddle/distributed/auto_parallel/api.py | 22 +++- .../passes/auto_parallel_gradient_merge.py | 105 +++++++++++++++--- 2 files changed, 109 insertions(+), 18 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py index f39cb1ab34675..d64bf66c51192 100644 --- a/python/paddle/distributed/auto_parallel/api.py +++ b/python/paddle/distributed/auto_parallel/api.py @@ -31,6 +31,7 @@ EagerParamBase, Variable, default_main_program, + in_dygraph_mode, in_pir_mode, use_pir_api, ) @@ -1000,7 +1001,7 @@ def get_placement_with_sharding(param, sharding_mesh_axis): class _ShardOptimizer(Optimizer): - def __init__(self, optimizer, shard_fn=None): + def __init__(self, optimizer, shard_fn=None, gradient_accumulation_steps=1): assert ( optimizer is not None ), "The argument `optimizer` cannot be empty." 
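Background for the gradient_accumulation_steps argument added above: with delayed loss scaling, each micro-batch backward accumulates unscaled gradients, and the division by the step count happens once, right before the parameter update (the same 1.0 / k_steps scale the gradient-merge pass inserts below). A minimal pure-Python sketch of that arithmetic, with toy numbers that are not Paddle code:

k_steps = 4
micro_grads = [0.5, 1.5, 2.0, 4.0]  # stand-in per-micro-batch grads of one param

accum = 0.0
for g in micro_grads:
    accum += g                 # backward passes: accumulate without scaling

update_grad = accum / k_steps  # scaled once, just before the optimizer op

# same average as scaling every micro-step, but with a single scale op
assert update_grad == sum(g / k_steps for g in micro_grads)

On the API side this is threaded through dist.shard_optimizer(optimizer, shard_fn, gradient_accumulation_steps=k_steps), as the hunks below show.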
@@ -1025,6 +1026,7 @@ def __init__(self, optimizer, shard_fn=None): self._shard_fn = shard_fn self._sharding_mesh_axis = None self._sharding_degree = None + self.gradient_accumulation_steps = gradient_accumulation_steps if isinstance( self._shard_fn, (ShardingStage1, ShardingStage2, ShardingStage3) @@ -1246,6 +1248,21 @@ def state_dict(self): return self._inner_opt.state_dict() def _append_optimize_op(self, block, param_and_grad): + if ( + in_auto_parallel_align_mode() # In align mode, we use enable_delay_scale_loss by default + and in_dygraph_mode() + and param_and_grad[1].is_dist() + ): + placements = param_and_grad[1].placements + meshs = param_and_grad[1].process_mesh + grad = param_and_grad[1] + + for i in range(len(placements) - 1, -1, -1): + if isinstance(placements[i], dist.Partial): + placements[i] = dist.Replicate() + grad = dist.reshard(grad, meshs, placements) + grad /= self.gradient_accumulation_steps + param_and_grad = (param_and_grad[0], grad) return self._inner_opt._append_optimize_op(block, param_and_grad) def __getattr__(self, item): @@ -1596,6 +1613,7 @@ def __call__(self, key: str, param: Tensor, accumulator: Tensor) -> Tensor: def shard_optimizer( optimizer: Optimizer, shard_fn: Callable[[str, Tensor, Tensor], Tensor] | None = None, + gradient_accumulation_steps: int = 1, ) -> _ShardOptimizer: """ @@ -1640,7 +1658,7 @@ def shard_fn(accumulator_name, param, accumulator) -> sharded_accumulator >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py """ - return _ShardOptimizer(optimizer, shard_fn) + return _ShardOptimizer(optimizer, shard_fn, gradient_accumulation_steps) def shard_scaler(scaler: GradScaler) -> GradScaler: diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index f03a05789ae30..524832bcd1895 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -636,6 +636,94 @@ def parse_program( return grad_to_gradient_merge +def _find_trival_optimizer_ops(block): + optimizer_ops = [] + for op in block.ops: + if "adam" in op.name() or "sgd" in op.name(): + optimizer_ops.append(op) + return optimizer_ops + + +def _get_prev_op(block, optimizer_op): + found = False + for op in reversed(block.ops): + if found: + return op + if op.id == optimizer_op.id: + found = True + return None + + +def _insert_scale_op_after(target_value, optimizer_op, scale, bias=0.0): + scaled_grad = paddle._C_ops.scale_(target_value, scale, bias, False) + + scale_op = scaled_grad.get_defining_op() + scale_op.op_role = int(OpRole.Optimize) + + full_op = scale_op.operand_source(1).get_defining_op() + assert ( + full_op.name() == "pd_op.full" + ), f"The defining op of the scale value should be `pd_op.full`, but got {full_op.name()}" + full_op.op_role = int(OpRole.Optimize) + + if "adam" in optimizer_op.name(): + optimizer_op.operand(1).set_source(scaled_grad) + elif "sgd" in optimizer_op.name(): + optimizer_op.operand(2).set_source(scaled_grad) + + +def _append_scale_op_before_comm(block, new_params_to_grads, k_steps): + for op in reversed(block.ops): + if op.op_role == int(OpRole.Backward): + paddle.pir.set_insertion_point_after(op) + break + for _, new_grad in new_params_to_grads: + new_grad = paddle._C_ops.scale_(new_grad, 1.0 / k_steps, 0.0, False) + + scale_op = new_grad.get_defining_op() + scale_op.op_role = int(OpRole.Optimize) + + full_op = scale_op.operand_source(1).get_defining_op() + assert ( 
+ full_op.name() == "pd_op.full" + ), f"The defining op of the scale value should be `pd_op.full`, but got {full_op.name()}" + full_op.op_role = int(OpRole.Optimize) + paddle.pir.set_insertion_point_to_block_end(block) + + +def _append_scale_op_after_comm(block, optimizer_ops, k_steps): + for optimizer_op in optimizer_ops: + target_value = None + if "adam" in optimizer_op.name(): # adam and adamw are included + target_value = optimizer_op.operand_source(1) + elif "sgd" in optimizer_op.name(): + target_value = optimizer_op.operand_source(2) + else: + raise NotImplementedError( + f"Only adamw, adam and sgd are supported, but got {optimizer_op.name()}" + ) + assert ( + target_value is not None + ), "target_value is not expected to be None" + insertion_point = target_value.get_defining_op() + if insertion_point is None: + # target_value is a gradient_merge_var, which has no defining_op, + # so we find the prev op of optimizer_op and insert a scale op after it. + insertion_point = _get_prev_op(block, optimizer_op) + paddle.pir.set_insertion_point_after(insertion_point) + _insert_scale_op_after(target_value, optimizer_op, 1.0 / k_steps) + paddle.pir.set_insertion_point_to_block_end(block) + + +def _pir_append_scale_op(program, new_params_to_grads, k_steps): + block = program.global_block() + optimizer_ops = _find_trival_optimizer_ops(block) + if len(optimizer_ops) > 0: + _append_scale_op_after_comm(block, optimizer_ops, k_steps) + else: + _append_scale_op_before_comm(block, new_params_to_grads, k_steps) + + def _pir_parse_program( main_program, startup_program, @@ -657,22 +745,7 @@ # step3: append scale op if avg: - main_block = main_program.global_block() - for op in reversed(main_block.ops): - if op.op_role == int(OpRole.Backward): - paddle.pir.set_insertion_point_after(op) - break - for _, new_grad in new_params_to_grads: - new_grad = paddle._C_ops.scale_(new_grad, 1.0 / k_steps, 0.0, False) - - scale_op = new_grad.get_defining_op() - scale_op.op_role = int(OpRole.Optimize) - - full_op = scale_op.operand_source(1).get_defining_op() - assert ( - full_op.name() == "pd_op.full" - ), f"The defining op of the scale value should be `pd_op.full`, but got {full_op.name()}" - full_op.op_role = int(OpRole.Optimize) + _pir_append_scale_op(main_program, new_params_to_grads, k_steps) @register_pass("auto_parallel_gradient_merge_pass") From dbf1600b7760f28d89e67b12a2658a380dd156d3 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Sat, 12 Oct 2024 17:01:27 +0800 Subject: [PATCH 094/135] Add xpu dockerfile;test=document_fix (#68646) --- tools/dockerfile/Dockerfile.develop.xpu | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tools/dockerfile/Dockerfile.develop.xpu diff --git a/tools/dockerfile/Dockerfile.develop.xpu b/tools/dockerfile/Dockerfile.develop.xpu new file mode 100644 index 0000000000000..bfa40d76daa7e --- /dev/null +++ b/tools/dockerfile/Dockerfile.develop.xpu @@ -0,0 +1,32 @@ +#Docker Image for PaddlePaddle Kunlun XPU + +FROM registry.baidubce.com/device/paddle-cpu:ubuntu20-x86_64-gcc84-py310 +LABEL maintainer="PaddlePaddle Authors " + +ARG XRE_VERSION=4.31.0 +ARG XRE_INSTALL=/usr/local/xpu-${XRE_VERSION} + +WORKDIR /opt +RUN wget -q https://klx-sdk-release-public.su.bcebos.com/xre/release/${XRE_VERSION}.1/xre-ubuntu_2004_x86_64.tar.gz && \ + tar -zxf xre-ubuntu_2004_x86_64.tar.gz && \ + mkdir -p ${XRE_INSTALL} && \ + cp -af /opt/xre-ubuntu_2004_x86_64/bin/ ${XRE_INSTALL}/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/include/ ${XRE_INSTALL}/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/tools/ ${XRE_INSTALL}/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/version.txt ${XRE_INSTALL}/ && \ + mkdir -p ${XRE_INSTALL}/lib64 && \ + cp -af /opt/xre-ubuntu_2004_x86_64/lib/* ${XRE_INSTALL}/lib64/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/so/* ${XRE_INSTALL}/lib64/ && \ + ln -sf ${XRE_INSTALL} /usr/local/xpu && \ + ln -sf ${XRE_INSTALL}/bin/xpu_smi /usr/local/bin/xpu_smi && \ + rm -rf xre-ubuntu_2004_x86_64.tar.gz && rm -rf xre-ubuntu_2004_x86_64/ + +ENV PATH=${XRE_INSTALL}/bin:$PATH + +#upgrade pip +RUN pip install --upgrade pip setuptools wheel +RUN pip cache purge + +EXPOSE 22
From f1eaa55d7b98886a601a622ab950944338753b7d Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:36:25 +0800 Subject: [PATCH 095/135] fix (#68623) --- .../auto_parallel/static/pir_pass.py | 61 ++++++++++++------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index 9d9f91e0fe67d..a0bbd90bd6612 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -1444,30 +1444,45 @@ def fuse_attention_ffn_qkv_pass( dy_param.local_shape[-1] // dy_param.local_head_dims ) concated_dy_param_index.append(param_index) - # Fuse params and init pir program fusion params. - with paddle.base.dygraph.guard(): - if len(dy_param_list) == 3: - is_qkv = True - num_heads = dy_param_list[0].local_num_head - num_key_value_heads = dy_param_list[1].local_num_head - else: - is_qkv = False - num_heads = None - num_key_value_heads = None - concated_param = fuse_param_func( - [obj._local_value() for obj in concated_dy_param_list], - is_qkv=is_qkv, - num_heads=num_heads, - num_key_value_heads=num_key_value_heads, - ) - pir_scope_param = ( - paddle.static.global_scope().var(pir_param).get_tensor() - ) - pir_scope_param._share_data_with(concated_param.get_tensor()) - # Pop and relase original params from concrete_program - for param in concated_dy_param_list: - param.get_tensor()._clear() + dy_param_init = True + for p in concated_dy_param_list: + if not p._local_value()._is_initialized(): + dy_param_init = False + break + + if dy_param_init: + # Fuse params and init pir program fusion params.
+ with paddle.base.dygraph.guard(): + if len(dy_param_list) == 3: + is_qkv = True + num_heads = dy_param_list[0].local_num_head + num_key_value_heads = dy_param_list[ + 1 + ].local_num_head + else: + is_qkv = False + num_heads = None + num_key_value_heads = None + concated_param = fuse_param_func( + [ + obj._local_value() + for obj in concated_dy_param_list + ], + is_qkv=is_qkv, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + ) + + pir_scope_param = ( + paddle.static.global_scope().var(pir_param).get_tensor() + ) + pir_scope_param._share_data_with( + concated_param.get_tensor() + ) + # Pop and relase original params from concrete_program + for param in concated_dy_param_list: + param.get_tensor()._clear() concated_dy_param_index.sort(reverse=True) for index in concated_dy_param_index: concrete_program.parameters[0].pop(index) From 059b646926d0e7698502597bb20fb08816712c88 Mon Sep 17 00:00:00 2001 From: ooo oo <106524776+ooooo-create@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:48:09 +0800 Subject: [PATCH 096/135] [CINN] Refine op_test for infer symbol shape (#68607) * fix infer_and_compare_symbol func * fix slogdet and mean op, refine logic for IntArray * fix more test --- .../infer_symbolic_shape/binary_infer_sym.cc | 18 ++++----- .../same_operands_result.cc | 1 - .../same_operands_result.h | 1 - .../infer_symbolic_shape/unary_infer_sym.cc | 40 ++++++++++++++++++- .../infer_symbolic_shape/unary_infer_sym.h | 1 + test/legacy_test/op_test.py | 19 ++++++++- test/legacy_test/test_bicubic_interp_v2_op.py | 4 +- .../legacy_test/test_bilinear_interp_v2_op.py | 6 ++- test/legacy_test/test_nearest_interp_v2_op.py | 4 +- 9 files changed, 76 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc index 67d687fadd59a..9e069abf557e9 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/binary_infer_sym.cc @@ -2054,20 +2054,20 @@ bool IndexSelectOpInferSymbolicShape( std::vector x_shape = x_shape_or_data.shape(); std::vector index_shape = index_shape_or_data.shape(); - int64_t dim = op->attribute("dim").data(); + int64_t axis = op->attribute("axis").data(); auto input_rank = x_shape.size(); auto index_rank = index_shape.size(); PADDLE_ENFORCE_EQ( - dim < static_cast(input_rank) && - dim >= (0 - static_cast(input_rank)), + axis < static_cast(input_rank) && + axis >= (0 - static_cast(input_rank)), true, common::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + "Attr(axis) is out of range, It's expected " + "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", input_rank, input_rank - 1, - dim)); + axis)); PADDLE_ENFORCE_EQ(index_rank == 1 || index_rank == 2, true, @@ -2082,12 +2082,12 @@ bool IndexSelectOpInferSymbolicShape( if (index_rank == 2) infer_context->AddEqualCstr(index_shape[1], symbol::DimExpr{1}); - if (dim < 0) { - dim += input_rank; + if (axis < 0) { + axis += input_rank; } std::vector output_shape = x_shape; - output_shape[dim] = index_shape[0]; + output_shape[axis] = index_shape[0]; infer_context->SetShapeOrDataForValue( op->result(0), diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc index 0a5fd089731df..b4acd181269b1 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc @@ -160,7 +160,6 @@ OP_SAME_OPERANDS_AND_RESULT(Sin) OP_SAME_OPERANDS_AND_RESULT(Sin_) OP_SAME_OPERANDS_AND_RESULT(Sinh) OP_SAME_OPERANDS_AND_RESULT(Sinh_) -OP_SAME_OPERANDS_AND_RESULT(Slogdet) OP_SAME_OPERANDS_AND_RESULT(Softmax) OP_SAME_OPERANDS_AND_RESULT(Softmax_) OP_SAME_OPERANDS_AND_RESULT(Softplus) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h index f0c714cb9fdc1..58612d73c3203 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.h @@ -152,7 +152,6 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sin) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sin_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sinh) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sinh_) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Slogdet) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Softmax) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Softmax_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Softplus) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index 33eae43d636c4..21ad54ab5a384 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -2004,8 +2004,19 @@ bool MeanOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { bool keepdim = GetBoolAttr(op, "keepdim"); std::vector axis; - if (paddle::dialect::details::GetAxisFromOpInput( - op->operand_source(1), infer_context, &axis)) { + if (op->num_operands() == 1) { + const auto attributes = op->attributes(); + if (op->attributes().find("axis") != attributes.end()) { + axis = op->attribute("axis") + .data() + .GetData(); + bool reduce_all = axis.size() == 0; + + return details::ReduceInferDim( + op, infer_context, axis, keepdim, reduce_all); + } + } else if (paddle::dialect::details::GetAxisFromOpInput( + op->operand_source(1), infer_context, &axis)) { bool reduce_all = axis.size() == 0; return details::ReduceInferDim( @@ -3135,6 +3146,31 @@ bool SliceOpInferSymbolicShape(pir::Operation *op, return true; } +bool SlogdetOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const auto &x_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(0)); + const auto &x_shape = x_shape_or_data.shape(); + 
size_t x_shape_size = x_shape.size(); + PADDLE_ENFORCE_GE( + x_shape_size, + 2, + common::errors::InvalidArgument( + "the input matrix dimension size should be greater than 2.")); + infer_context->AddEqualCstr(x_shape[x_shape_size - 1], + x_shape[x_shape_size - 2]); + std::vector out_shape = {2}; + size_t additional_dims = x_shape.size() - 2; + for (size_t i = 0; i < additional_dims; i++) { + out_shape.push_back(x_shape[i]); + } + infer_context->SetShapeOrDataForValue( + op->result(0), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(out_shape)}); + return true; +} + bool SplitOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { // input diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h index 290a3b7e48801..55d0267da5cf0 100755 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h @@ -128,6 +128,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(ShapeSr) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(ShardIndex) OP_DECLARE_INFER_SYMBOLIC_SHAPE(ShuffleChannel) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Slice) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Slogdet) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Split) OP_DECLARE_INFER_SYMBOLIC_SHAPE(SplitWithNum) OP_DECLARE_INFER_SYMBOLIC_SHAPE(SquaredL2Norm) diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index 3ed41c504963b..3444f7a47312a 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -1653,7 +1653,8 @@ def _infer_and_compare_symbol(self, place): kernel_sig = self.get_kernel_signature(place) program = paddle.static.Program() with paddle.static.program_guard(program): - with scope_guard(Scope()): + scope = Scope() + with scope_guard(scope): # prepare inps attributes feed ( static_inputs, @@ -1708,8 +1709,22 @@ def _infer_and_compare_symbol(self, place): # executor run executor = Executor(place) outs = executor.run(program, feed=feed, fetch_list=[fetch_list]) + # get fetch program + fetch_list = executor._check_fetch_list([fetch_list]) + fetch_program, _, _ = ( + executor._executor_cache.get_pir_program_and_executor( + program=program, + feed=feed, + fetch_list=fetch_list, + feed_var_name='feed', + fetch_var_name='fetch', + place=place, + scope=scope, + plan=None, + ) + ) - self._compare_symbol(program, outs) + self._compare_symbol(fetch_program, outs) def _compare_expect_and_actual_outputs( self, place, fetch_list, expect_outs, actual_outs, inplace_atol=None diff --git a/test/legacy_test/test_bicubic_interp_v2_op.py b/test/legacy_test/test_bicubic_interp_v2_op.py index 4d418789c30bd..4a368762055fa 100644 --- a/test/legacy_test/test_bicubic_interp_v2_op.py +++ b/test/legacy_test/test_bicubic_interp_v2_op.py @@ -484,7 +484,9 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(output_np)} def test_check_output(self): - self.check_output(check_pir=True) + self.check_output( + check_pir=True, check_symbol_infer=(self.out_size is None) + ) def test_check_grad(self): self.check_grad(['X'], 'Out', in_place=True, check_pir=True) diff --git a/test/legacy_test/test_bilinear_interp_v2_op.py b/test/legacy_test/test_bilinear_interp_v2_op.py index 79f29b159c864..15adc49e878ba 100755 --- a/test/legacy_test/test_bilinear_interp_v2_op.py +++ b/test/legacy_test/test_bilinear_interp_v2_op.py @@ -388,7 +388,11 @@ def init_test_case(self): class
TestBilinearInterpOpFP16(TestBilinearInterpOp): def test_check_output(self): - self.check_output(atol=1e-3, check_pir=True) + self.check_output( + atol=1e-3, + check_pir=True, + check_symbol_infer=(self.out_size is None), + ) def test_check_grad(self): self.check_grad( diff --git a/test/legacy_test/test_nearest_interp_v2_op.py b/test/legacy_test/test_nearest_interp_v2_op.py index 3266aa45d4b60..5e9a8fa4ea763 100755 --- a/test/legacy_test/test_nearest_interp_v2_op.py +++ b/test/legacy_test/test_nearest_interp_v2_op.py @@ -613,7 +613,9 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(output_np)} def test_check_output(self): - self.check_output(check_pir=True) + self.check_output( + check_pir=True, check_symbol_infer=(self.out_size is None) + ) def test_check_grad(self): self.check_grad(['X'], 'Out', in_place=True, check_pir=True) From 1b1be7d4cd79b0dc9957b7b7194c9b25183bbb43 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sat, 12 Oct 2024 22:40:09 +0800 Subject: [PATCH 097/135] fix slice op infer symbolic bug (#68621) --- .../interface/infer_symbolic_shape/infer_sym_slice_utils.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h index e576fb90c42b5..164f155337f06 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h @@ -85,9 +85,11 @@ inline void CheckAndUpdateSliceAttrs( // following different arrangements. ends.at(i) = IsMaxInt(ends.at(i)) ? in_dims.at(axis) : ends.at(i); + auto out_dim = ends[i] - starts[i]; // If in_dims[axis] or ends[i] have symbol, nedd get Min(in_dims[axis], // ends[i]) - if (!in_dims[axis].isa() || !ends[i].isa()) { + if (!out_dim.isa() && + (!in_dims[axis].isa() || !ends[i].isa())) { symbol::List min_lists{in_dims[axis], ends[i]}; ends.at(i) = symbol::DimExpr({symbol::Min({min_lists})}); } From 9180d644235d387a3538aa77fe5e9deb2b1c55dd Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 14 Oct 2024 09:29:30 +0800 Subject: [PATCH 098/135] Clean recurrent config [fluid_ops] (#68583) * Fix * Fix * ci * ci --- python/paddle/base/backward.py | 2 +- python/paddle/base/framework.py | 7 -- .../auto_parallel/static/cost/comp_op_cost.py | 8 -- python/paddle/distributed/io.py | 5 -- python/paddle/incubate/operators/__init__.py | 1 - python/paddle/incubate/operators/unzip.py | 89 ------------------- test/auto_parallel/test_comp_cost.py | 6 -- .../deprecated/ir/inference/program_config.py | 5 -- test/ir/inference/program_config.py | 5 -- 9 files changed, 1 insertion(+), 127 deletions(-) delete mode 100644 python/paddle/incubate/operators/unzip.py diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index d2e400276cd91..0c6d1153d5886 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -1252,7 +1252,7 @@ def _get_sub_block_path( Args: sub_block(Block): The sub-block in which to get op path. - sub_block_op_desc: The op desc of the sub-block op such as 'while', 'conditional_block' and 'recurrent'. + sub_block_op_desc: The op desc of the sub-block op such as 'while', 'conditional_block'. no_grad_set(set): The set of no grad var name. no_grad_set will be changed. op_path_dict(dict): op_path_dict will be changed. 
key(int) block index diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 14dbb58bbd4aa..c583e4e15e7c4 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -3154,7 +3154,6 @@ class Operator: OP_WITHOUT_KERNEL_SET = { "feed", "fetch", - "recurrent", "go", "conditional_block", "pylayer", @@ -3163,7 +3162,6 @@ class Operator: "recv", "listen_and_serv", "fl_listen_and_serv", - "ncclInit", "select", "checkpoint_notify", "gen_bkcl_id", @@ -3173,9 +3171,6 @@ class Operator: "c_comm_init", "c_sync_calc_stream", "c_sync_comm_stream", - "queue_generator", - "dequeue", - "enqueue", "heter_listen_and_serv", "c_wait_comm", "c_wait_compute", @@ -4687,8 +4682,6 @@ def pass_stop_gradient(ins, outs): "conditional_block_grad", "pylayer", "pylayer_grad", - "recurrent", - "recurrent_grad", "while", "while_grad", } diff --git a/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py index eff87fcf71eaf..496d30ec0e4b2 100644 --- a/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py @@ -455,14 +455,6 @@ def __init__(self, op=None, op_desc=None, cluster=None, rank=None): super().__init__(op=op, op_desc=op_desc, cluster=cluster, rank=rank) -@register_op_cost -class SamplingIdOpCost(CompOpCost): - OP_TYPE = "sampling_id" - - def __init__(self, op=None, op_desc=None, cluster=None, rank=None): - super().__init__(op=op, op_desc=op_desc, cluster=cluster, rank=rank) - - @register_op_cost class ScaleOpCost(CompOpCost): OP_TYPE = "scale" diff --git a/python/paddle/distributed/io.py b/python/paddle/distributed/io.py index f96262fb18ce9..cc7c148cf063e 100644 --- a/python/paddle/distributed/io.py +++ b/python/paddle/distributed/io.py @@ -100,11 +100,6 @@ def __load_persistable_vars(executor, dirname, need_load_vars): attrs={'file_path': os.path.join(dirname, origin_var.name)}, ) - load_block.append_op( - type='delete_var', - inputs={'X': need_delete_vars}, - ) - executor.run(load_prog) if not isinstance(main_program, Program): diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 653dc97ed6193..001106af9df43 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -21,4 +21,3 @@ from .softmax_mask_fuse_upper_triangle import ( # noqa: F401 softmax_mask_fuse_upper_triangle, ) -from .unzip import unzip # noqa: F401 diff --git a/python/paddle/incubate/operators/unzip.py b/python/paddle/incubate/operators/unzip.py deleted file mode 100644 index a567d144d1636..0000000000000 --- a/python/paddle/incubate/operators/unzip.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations - -from typing import TYPE_CHECKING - -from paddle.base.data_feeder import check_type, check_variable_and_dtype -from paddle.base.layer_helper import LayerHelper - -if TYPE_CHECKING: - from paddle import Tensor - - -def unzip(input: Tensor, lod: Tensor, len: int) -> Tensor: - r""" - - **unzip layers** - - unzip 'input' according to 'lod' - - Args: - input (Tensor): The zipped input - len(int): The second dim length of unzipped output. - lod (Tensor): The original lod of unzipped input, 1-D LodTensor with shape[K]. - - Returns: - Tensor, The original unzipped tensor, 2-D LodTensor with shape[K-1, len]. - - Examples: - - .. code-block:: python - - >>> # doctest: +REQUIRES(env:GPU) - >>> import numpy as np - >>> import paddle - >>> paddle.set_device('gpu') - >>> input_np = np.array([1, 2, 3, 1, 2, 4]) - >>> lod_np = np.array([0, 3, 3, 3, 4, 6]) - >>> len = 4 - >>> input = paddle.to_tensor(input_np, "int64") - >>> lod = paddle.to_tensor(lod_np, "int64") - >>> unzipped_input = paddle.incubate.operators.unzip(input, lod, len) # type: ignore[operator] - >>> print(unzipped_input) - Tensor(shape=[5, 4], dtype=int64, place=Place(gpu:0), stop_gradient=True, - [[1, 2, 3, 0], - [0, 0, 0, 0], - [0, 0, 0, 0], - [1, 0, 0, 0], - [2, 4, 0, 0]]) - - """ - helper = LayerHelper('unzip', **locals()) - out = helper.create_variable(dtype=input.dtype) - check_variable_and_dtype( - input, - 'input', - [ - 'float16', - 'float32', - 'float64', - 'int', - 'bool', - 'int64', - 'complex64', - 'complex128', - ], - 'unzip', - ) - check_variable_and_dtype(lod, 'lod', ['int', 'int64'], 'unzip') - check_type(len, 'len', (int), 'unzip') - attrs = {'len': len} - helper.append_op( - type='unzip', - inputs={'X': [input], 'lod': [lod]}, - outputs={'Y': [out]}, - attrs=attrs, - ) - return out diff --git a/test/auto_parallel/test_comp_cost.py b/test/auto_parallel/test_comp_cost.py index 7afb077b7e186..c2b3e732a7745 100644 --- a/test/auto_parallel/test_comp_cost.py +++ b/test/auto_parallel/test_comp_cost.py @@ -69,7 +69,6 @@ ReduceSumOpCost, Reshape2GradOpCost, Reshape2OpCost, - SamplingIdOpCost, ScaleOpCost, SliceOpCost, SoftmaxGradOpCost, @@ -335,11 +334,6 @@ def test_comp_cost(self): self.assertTrue(op_cost.time >= 0) self.assertTrue(op_cost.memory >= 0) - op_cost = SamplingIdOpCost(cluster=cluster) - self.assertTrue(op_cost.flops >= 0) - self.assertTrue(op_cost.time >= 0) - self.assertTrue(op_cost.memory >= 0) - op_cost = ScaleOpCost(cluster=cluster) self.assertTrue(op_cost.flops >= 0) self.assertTrue(op_cost.time >= 0) diff --git a/test/deprecated/ir/inference/program_config.py b/test/deprecated/ir/inference/program_config.py index ae49a57f6e086..09b933f8a35bd 100644 --- a/test/deprecated/ir/inference/program_config.py +++ b/test/deprecated/ir/inference/program_config.py @@ -121,7 +121,6 @@ def __repr__(self): _OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', - 'recurrent', 'go', 'conditional_block', 'static_pylayer', @@ -130,7 +129,6 @@ def __repr__(self): 'recv', 'listen_and_serv', 'fl_listen_and_serv', - 'ncclInit', 'select', 'checkpoint_notify', 'gen_bkcl_id', @@ -140,9 +138,6 @@ def __repr__(self): 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', - 'queue_generator', - 'dequeue', - 'enqueue', 'heter_listen_and_serv', 'c_wait_comm', 'c_wait_compute', diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index ae49a57f6e086..09b933f8a35bd 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -121,7 +121,6 @@ def 
__repr__(self): _OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', - 'recurrent', 'go', 'conditional_block', 'static_pylayer', @@ -130,7 +129,6 @@ def __repr__(self): 'recv', 'listen_and_serv', 'fl_listen_and_serv', - 'ncclInit', 'select', 'checkpoint_notify', 'gen_bkcl_id', @@ -140,9 +138,6 @@ def __repr__(self): 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', - 'queue_generator', - 'dequeue', - 'enqueue', 'heter_listen_and_serv', 'c_wait_comm', 'c_wait_compute', From 60cfe8122efe019bd04ca5d4bd6a4991be5588a1 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 14 Oct 2024 09:30:24 +0800 Subject: [PATCH 099/135] Clean fluid/operators include [fluid_ops] (#68649) * Fix * Fix * Fix --- paddle/fluid/operators/activation_op.h | 11 - paddle/fluid/operators/batch_norm_op.cc | 68 +-- paddle/fluid/operators/batch_norm_op.h | 3 - .../elementwise/elementwise_mul_op.h | 1 - .../fill_constant_batch_size_like_op.cc | 1 - .../operators/fused/fused_gemm_epilogue_op.cc | 1 - paddle/fluid/operators/fused/multi_gru_op.cc | 3 - paddle/fluid/operators/matmul_op.cc | 526 ------------------ .../operators/optimizers/lars_momentum_op.cc | 1 - paddle/fluid/operators/quantize_linear_op.cc | 7 +- paddle/fluid/operators/reshape_op.cc | 131 ----- 11 files changed, 2 insertions(+), 751 deletions(-) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 25137f9285e09..229c4e22685d9 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -30,8 +30,6 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/activation_functor.h" @@ -61,15 +59,6 @@ struct BaseActivationFunctor { template \ using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; -USE_PHI_FUNCTOR(Tanh) -USE_PHI_FUNCTOR(Relu6) -USE_PHI_FUNCTOR(HardShrink) -USE_PHI_FUNCTOR(ELU) -USE_PHI_FUNCTOR(Sigmoid) -USE_PHI_FUNCTOR(HardSigmoid) -USE_PHI_FUNCTOR(Swish) -USE_PHI_FUNCTOR(HardSwish) -USE_PHI_FUNCTOR(Pow) USE_PHI_FUNCTOR(Mish) template diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index a69b09712a15f..a73b736f33553 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/onednn_helper.h" #endif -#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" @@ -547,70 +546,6 @@ phi::KernelKey BatchNormDoubleGradOp::GetExpectedKernelType( ctx.GetPlace()); } -class BatchNormCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { - using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; - - public: - void Apply() override { - // inputs and outputs of batch_norm - paddle::Tensor x = this->GetSingleForwardInput("X"); - paddle::Tensor scale = this->GetSingleForwardInput("Scale"); - paddle::Tensor bias = this->GetSingleForwardInput("Bias"); - paddle::Tensor mean = this->GetSingleForwardInput("Mean"); - paddle::Tensor variance = this->GetSingleForwardInput("Variance"); - paddle::Tensor y = this->GetSingleForwardOutput("Y"); - paddle::Tensor mean_out = this->GetSingleForwardOutput("MeanOut"); - paddle::Tensor variance_out = this->GetSingleForwardOutput("VarianceOut"); - paddle::Tensor saved_mean = this->GetSingleForwardOutput("SavedMean"); - paddle::Tensor saved_variance = - this->GetSingleForwardOutput("SavedVariance"); - paddle::optional reserve_space; - - paddle::Tensor y_grad = this->GetSingleOutputGrad("Y"); - paddle::Tensor x_grad = this->GetSingleInputGrad("X"); - paddle::Tensor scale_grad = this->GetSingleInputGrad("Scale"); - paddle::Tensor bias_grad = this->GetSingleInputGrad("Bias"); - - auto dx_ptr = this->GetOutputPtr(&x_grad); - std::string dx_name = this->GetOutputName(x_grad); - auto dscale_ptr = this->GetOutputPtr(&scale_grad); - std::string dscale_name = this->GetOutputName(scale_grad); - auto dbias_ptr = this->GetOutputPtr(&bias_grad); - std::string dbias_name = this->GetOutputName(bias_grad); - - // attrs of batch_norm - auto momentum = this->Attr("momentum"); - auto epsilon = this->Attr("epsilon"); - auto data_layout = this->Attr("data_layout"); - auto is_test = this->Attr("is_test"); - auto use_global_stats = this->Attr("use_global_stats"); - auto trainable_statistics = this->Attr("trainable_statistics"); - - VLOG(3) << "Running batch_norm composite func"; - prim::batch_norm_grad(x, - scale, - bias, - mean_out, - variance_out, - saved_mean, - saved_variance, - reserve_space, - y_grad, - momentum, - epsilon, - data_layout, - is_test, - use_global_stats, - trainable_statistics, - dx_ptr, - dscale_ptr, - dbias_ptr); - this->RecoverOutputName(x_grad, dx_name); - this->RecoverOutputName(scale_grad, dscale_name); - this->RecoverOutputName(bias_grad, dbias_name); - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators @@ -627,8 +562,7 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, - ops::BatchNormGradMaker, - ops::BatchNormCompositeGradOpMaker); + ops::BatchNormGradMaker); REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 0a18d3d5a710a..bbfe811c7cc61 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -19,9 +19,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/norm_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 8d1b52325de69..f0c6d82a53bbe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 554f7170a6ad5..4059caeca7066 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index d15f649af131e..fc3500cbd90c6 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/fusion.h" -#include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index dc79d05ab0eb2..2ad9811d4b2cf 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -18,9 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/fc_functor.h" -#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 0ad2b60221d4f..6d162931337fa 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" namespace paddle { @@ -54,326 +53,6 @@ static phi::DDim ColumnMatrixFromVector(const phi::DDim &y_dim) { return common::make_ddim({y_dim[0], 1}); } -#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) || \ - defined(PADDLE_WITH_HIP) -template -typename std::enable_if::value, void>::type -ComputeMatmulImpl(const framework::ExecutionContext &context) { - auto &dev_ctx = context.template device_context(); - - auto &x = GET_DATA_SAFELY( - context.Input("X"), "Input", "X", "MatMul"); - auto &y = GET_DATA_SAFELY( - context.Input("Y"), "Input", "Y", "MatMul"); - auto *out = context.Output("Out"); - - dev_ctx.template Alloc(out, out->numel() * sizeof(T)); - - phi::MatmulKernel(dev_ctx, - x, - y, - context.Attr("transpose_X"), - context.Attr("transpose_Y"), - out); -} -#endif - -template -typename std::enable_if::value, void>::type -ComputeMatmulImpl(const framework::ExecutionContext &context) { - auto &x = GET_DATA_SAFELY( - context.Input("X"), "Input", "X", "MatMul"); - auto &y = GET_DATA_SAFELY( - context.Input("Y"), "Input", "Y", "MatMul"); - auto *out = context.Output("Out"); - - auto &dev_ctx = context.template device_context(); - dev_ctx.template Alloc(out, out->numel() * sizeof(T)); - - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x.dims()), 0, context.Attr("transpose_X")); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y.dims()), 0, context.Attr("transpose_Y")); - auto scale = static_cast(context.Attr("alpha")); - - int head_number = 1; -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - head_number = context.Attr("head_number"); -#endif - - const auto &x_dims = x.dims(); - const auto &y_dims = y.dims(); - if (head_number <= 1 && x_dims.size() == 3 && y_dims.size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!context.Attr("transpose_X")) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); - - if (head_number > 1) { - blas.MatMulWithHead(x, - mat_dim_a, - y, - mat_dim_b, - scale, - head_number, - out, - T(0), - split_vertical_y); - } else { - blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); - } -#else - blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); -#endif -} - -template -class MatMulKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - ComputeMatmulImpl(context); - } -}; - -// Reshape a rank-3 tensor from P x M x N to (P * M) x N. -// Identity op if the tensor is not of rank 3. -static phi::DenseTensor FoldInitDims(const phi::DenseTensor &input) { - auto output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} - -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. 
-template -static phi::DenseTensor FoldHeadAndLastDims(const DeviceContext &context, - const phi::DenseTensor &input) { - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - phi::DenseTensor output; - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - output.mutable_data(context.GetPlace()); - std::vector axis = {1, 0, 2}; - phi::funcs::Transpose trans; - trans(context, input, &output, axis); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - - return output; -} - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. - */ -static void ReshapeTensorIntoMatrixSequence( - phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h = 0, w = 0; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} - -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. - */ -static void ReshapeXYOutIntoMatrixSequence(phi::DenseTensor *x, - phi::DenseTensor *y, - phi::DenseTensor *out, - bool trans_x, - bool trans_y) { - auto x_dim = RowMatrixFromVector(x->dims()); - auto y_dim = ColumnMatrixFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorIntoMatrixSequence(x, mat_dim_x); - ReshapeTensorIntoMatrixSequence(y, mat_dim_y); -} - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// transpose_X | False | True | False | True -// transpose_Y | False | False | True | True -// -----------+----------+----------+----------+----------- -// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T -// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T -// -// When X is a vector of size K, we treat it instead as a matrix of shape -// (1, K). Similarly, when Y is a vector of size K, we treat it instead as -// a matrix of shape (K, 1). -// -// When X and Y are both 3-dimensional tensors, then the first dimension -// the batch dimension can be ignored and the exact same formulas apply -// as for two matrices. -// -// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end -// up with formulas like -// -// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} -// -// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N -// to X: (P * M) x K, dOut: (P * M) x N. 
-template -class MatMulGradKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext &context, - const phi::DenseTensor &a, - bool trans_a, - const phi::DenseTensor &b, - bool trans_b, - phi::DenseTensor *out) const { - out->mutable_data(context.GetPlace()); - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); - - int head_number = 1; -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - if (context.HasAttr("head_number")) { - head_number = context.Attr("head_number"); - } -#endif - - if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a, - mat_dim_a, - b, - mat_dim_b, - static_cast(context.Attr("alpha")), - out, - T(0)); - } - - void CalcInputGrad(const framework::ExecutionContext &context, - const phi::DenseTensor &a, - bool trans_a, - bool is_fold_init_dims_a, - const phi::DenseTensor &b, - bool trans_b, - bool is_fold_init_dims_b, - phi::DenseTensor *out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out); - } else { - auto &ctx = context.template device_context(); - MatMul( - context, - is_fold_init_dims_a ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, - is_fold_init_dims_b ? FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, - out); - } - } - - void Compute(const framework::ExecutionContext &context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = *context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dy = context.Output(framework::GradVarName("Y")); - bool transpose_x = context.Attr("transpose_X"); - bool transpose_y = context.Attr("transpose_Y"); - - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - phi::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - phi::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - CalcInputGrad(context, y, true, true, dout, true, false, dx); - CalcInputGrad(context, dout, true, true, x, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(context, y, false, false, dout, true, false, dx); - CalcInputGrad(context, x, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(context, dout, false, false, y, false, true, dx); - CalcInputGrad(context, dout, true, true, x, false, true, dy); - } else { - CalcInputGrad(context, dout, false, false, y, true, false, dx); - CalcInputGrad(context, x, true, true, dout, false, true, dy); - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } - } - } -}; - phi::DDim GetDimForInput(const framework::InferShapeContext &ctx, std::string input_name) { auto dim = ctx.GetInputDim(input_name); @@ -386,211 +65,6 @@ phi::DDim GetDimForInput(const 
framework::InferShapeContext &ctx, return dim; } -template -class MatMulDoubleGradKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext &context, - const phi::DenseTensor &a, - bool trans_a, - const phi::DenseTensor &b, - bool trans_b, - bool flag, - phi::DenseTensor *out) const { - out->mutable_data(context.GetPlace()); - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); - - int head_number = 1; -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - head_number = context.Attr("head_number"); -#endif - - if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a, - mat_dim_a, - b, - mat_dim_b, - static_cast(context.Attr("alpha")), - out, - static_cast(flag)); - } - - void CalcInputGrad(const framework::ExecutionContext &context, - const phi::DenseTensor &a, - bool trans_a, - bool is_fold_init_dims_a, - const phi::DenseTensor &b, - bool trans_b, - bool is_fold_init_dims_b, - bool flag, - phi::DenseTensor *out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, flag, out); - } else { - auto &ctx = context.template device_context(); - MatMul( - context, - is_fold_init_dims_a ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, - is_fold_init_dims_b ? 
FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, - flag, - out); - } - } - - void Compute(const framework::ExecutionContext &context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = *context.Input("DOut"); - auto *ddx = context.Input("DDX"); - auto *ddy = context.Input("DDY"); - - auto *dx = context.Output("DX"); - auto *dy = context.Output("DY"); - auto *ddout = context.Output("DDOut"); - - bool transpose_x = context.Attr("transpose_X"); - bool transpose_y = context.Attr("transpose_Y"); - - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - phi::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - phi::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - phi::DDim ddout_dims; - if (ddout) { - ddout_dims = ddout->dims(); - if (ddout_dims != dout.dims()) { - ddout->Resize(dout.dims()); - } - } - - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = *ddx; - if (ddx_mat.dims() != x.dims()) { - ddx_mat.Resize(x.dims()); - } - if (dy) { - if (transpose_x && transpose_y) { - // dy = dout' * ddx' - CalcInputGrad( - context, dout, true, true, ddx_mat, true, false, false, dy); - } else if (transpose_x) { - // dy = ddx * dout - CalcInputGrad( - context, ddx_mat, false, false, dout, false, true, false, dy); - } else if (transpose_y) { - // dy = dout' * ddx - CalcInputGrad( - context, dout, true, true, ddx_mat, false, true, false, dy); - } else { - // dy = ddx' * dout - CalcInputGrad( - context, ddx_mat, true, true, dout, false, true, false, dy); - } - } - - if (ddout) { - CalcInputGrad(context, - ddx_mat, - transpose_x, - true, - y, - transpose_y, - false, - ddout_flag, - ddout); - ddout_flag = true; - } - } - - if (ddy) { - auto ddy_mat = *ddy; - if (ddy_mat.dims() != y.dims()) { - ddy_mat.Resize(y.dims()); - } - if (dx) { - if (transpose_x && transpose_y) { - // dx = ddy' * dout' - CalcInputGrad( - context, ddy_mat, true, true, dout, true, false, false, dx); - } else if (transpose_x) { - // dx = ddy * dout' - CalcInputGrad( - context, ddy_mat, false, false, dout, true, false, false, dx); - } else if (transpose_y) { - // dx = dout * ddy - CalcInputGrad( - context, dout, false, false, ddy_mat, false, true, false, dx); - } else { - // dx = dout * ddy' - CalcInputGrad( - context, dout, false, false, ddy_mat, true, false, false, dx); - } - } - - if (ddout) { - CalcInputGrad(context, - x, - transpose_x, - true, - ddy_mat, - transpose_y, - false, - ddout_flag, - ddout); - } - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } - - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } - } - - if (ddout) { - if (ddout_dims != dout.dims()) { - ddout->Resize(ddout_dims); - } - } - } -}; - class MatMulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index ed2cb36ceb604..f3d121a6c4996 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/multiary.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index b1f70b3e2ee38..b356a7a19e727 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -14,14 +14,9 @@ limitations under the License. */ #include #include "paddle/common/ddim.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/impl/clip_kernel_impl.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b8bb2aa3264e0..7633051a25666 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -26,8 +26,6 @@ limitations under the License. */ #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/reshape_grad_kernel.h" -#include "paddle/phi/kernels/reshape_kernel.h" namespace paddle { namespace framework { class InferShapeContext; @@ -378,135 +376,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } }; -class ReshapeKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *out = ctx.Output("Out"); - auto *in = ctx.Input("X"); - - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensor"); - auto *shape_tensor = - ctx.HasInput("Shape") ? 
ctx.Input("Shape") : nullptr; - phi::IntArray pt_scalar_shape; - if (!list_new_shape_tensor.empty()) { - // have shape tensor - std::vector pt_vec_shape; - for (auto &tensor : list_new_shape_tensor) { - if (tensor->place().GetType() == phi::AllocationType::GPU || - tensor->place().GetType() == phi::AllocationType::XPU) { - phi::DenseTensor temp; - paddle::framework::TensorCopySync(*tensor, phi::CPUPlace(), &temp); - pt_vec_shape.push_back(std::move(temp)); - } else { - pt_vec_shape.push_back(*tensor); - } - } - pt_scalar_shape = phi::IntArray(pt_vec_shape); - } else if (shape_tensor) { - phi::DenseTensor pt_shape; - if (shape_tensor->place().GetType() == phi::AllocationType::GPU || - shape_tensor->place().GetType() == phi::AllocationType::XPU) { - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *shape_tensor, phi::CPUPlace(), &temp); - pt_shape = std::move(temp); - } else { - pt_shape = *shape_tensor; - } - pt_scalar_shape = phi::IntArray(pt_shape); - } else { - auto &shape_attr = ctx.Attr>("shape"); - pt_scalar_shape = phi::IntArray(shape_attr); - } - if (ctx.GetPlace().GetType() == phi::AllocationType::CPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeKernel(static_cast(dev_ctx), - *in, - pt_scalar_shape, - out); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (ctx.GetPlace().GetType() == phi::AllocationType::GPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeKernel(static_cast(dev_ctx), - *in, - pt_scalar_shape, - out); - } -#endif -#ifdef PADDLE_WITH_XPU - if (ctx.GetPlace().GetType() == phi::AllocationType::XPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeKernel(static_cast(dev_ctx), - *in, - pt_scalar_shape, - out); - } -#endif - } -}; - -class ReshapeGradKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.Input("X"); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - - if (ctx.GetPlace().GetType() == phi::AllocationType::CPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeGradKernel( - static_cast(dev_ctx), *x, *d_out, d_x); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (ctx.GetPlace().GetType() == phi::AllocationType::GPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeGradKernel( - static_cast(dev_ctx), *x, *d_out, d_x); - } -#endif -#ifdef PADDLE_WITH_XPU - if (ctx.GetPlace().GetType() == phi::AllocationType::XPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeGradKernel( - static_cast(dev_ctx), *x, *d_out, d_x); - } -#endif - } -}; - -class ReshapeDoubleGradKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *dd_x = ctx.Input("DDX"); - auto *d_out = ctx.Input("DOut"); - auto *dd_out = ctx.Output("DDOut"); - dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); - - if (ctx.GetPlace().GetType() == phi::AllocationType::CPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeDoubleGradKernel( - static_cast(dev_ctx), *d_out, *dd_x, dd_out); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (ctx.GetPlace().GetType() == phi::AllocationType::GPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeDoubleGradKernel( - static_cast(dev_ctx), *d_out, *dd_x, dd_out); - } -#endif -#ifdef PADDLE_WITH_XPU - if (ctx.GetPlace().GetType() == phi::AllocationType::XPU) { - auto &dev_ctx = ctx.device_context(); - phi::ReshapeDoubleGradKernel( - 
static_cast(dev_ctx), *d_out, *dd_x, dd_out); - } -#endif - } -}; - // FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape, // the XShape is used to carry the shape and lod of X which will be used in // reshape_grad, in this way, the framework can reuse the memory of X From be626a7eb534901f3a70b36fbea227d0f530da6b Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 14 Oct 2024 09:51:30 +0800 Subject: [PATCH 100/135] optimize memory for unbalanced virtual pp (#68571) (#68660) --- .../framework/distributed_strategy.proto | 1 + .../fleet/meta_parallel/pipeline_parallel.py | 299 +++++++++++++++--- 2 files changed, 263 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 717fb9d5ae0e5..5979a1a9f88b8 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -88,6 +88,7 @@ message PpConfig { optional bool overlap_p2p_comm = 7 [default = false]; optional bool clear_every_step_cache = 8 [default = false]; optional bool use_batch_p2p_comm = 9 [default = true]; + optional bool best_unbalanced_scheduler = 10 [ default = false ]; } message DygraphShardingConfig { diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 6e48f7c769b2f..c9fbe62edaba5 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -17,7 +17,7 @@ import sys import time import warnings -from collections import defaultdict +from collections import defaultdict, deque from enum import Enum from typing import Callable @@ -1191,6 +1191,13 @@ def __init__(self, layers, hcg, strategy): self._virtual_pp_world_size = self.num_model_chunks self._virtual_pp_rank = 0 self._reset_counter() + self._best_unbalanced_scheduler = self._strategy.hybrid_configs[ + "pp_configs" + ].best_unbalanced_scheduler + if self._best_unbalanced_scheduler: + assert ( + not self._comm_overlap + ), "pp best unbalanced scheduler cannot run together with dp/sharding overlap" def _check_sanity(self): assert ( @@ -1255,18 +1262,77 @@ def _get_virtual_pp_rank(self, micro_step, forward): self.accumulate_steps % self.num_stages + self.num_stages ) first_chunk_steps = first_chunk_acc * self.num_model_chunks + if self._best_unbalanced_scheduler: + num_group_last_chunk_forward = ( + (micro_step - first_chunk_acc) // self.num_stages + ) // self.num_model_chunks + misplace_start = ( + first_chunk_acc + + self.num_model_chunks + * self.num_stages + * num_group_last_chunk_forward + ) + misplace_end = ( + self.accumulate_steps % self.num_stages + + num_group_last_chunk_forward * self.num_stages + ) * self.num_model_chunks + self.num_stages + forward_virtual_pp_stage = ( + (micro_step - first_chunk_acc) // self.num_stages + ) % self.num_model_chunks if micro_step < first_chunk_steps: virtual_pp_stage = micro_step // first_chunk_acc + if not forward and self._best_unbalanced_scheduler: + if ( + micro_step + >= first_chunk_acc + + (self.num_model_chunks - 1) * self.num_stages + ): + if forward_virtual_pp_stage == self.num_model_chunks - 1: + virtual_pp_stage = 0 + elif ( + micro_step >= misplace_start + and micro_step < misplace_end + ): + virtual_pp_stage = ( + micro_step - self.num_stages + ) // first_chunk_acc else: + origin_micro_step = micro_step micro_step -=
first_chunk_steps virtual_pp_stage = micro_step % ( self.num_stages * self.num_model_chunks ) virtual_pp_stage = virtual_pp_stage // self.num_stages + if not forward and self._best_unbalanced_scheduler: + total_num_forward_step_from_steady = ( + first_chunk_acc + + (self.accumulate_steps - first_chunk_acc) + * self.num_model_chunks + ) + if ( + origin_micro_step <= total_num_forward_step_from_steady + and forward_virtual_pp_stage == self.num_model_chunks - 1 + ): + virtual_pp_stage = 0 + elif ( + misplace_start <= total_num_forward_step_from_steady + and origin_micro_step >= misplace_start + and origin_micro_step < misplace_end + ): + if origin_micro_step < first_chunk_steps + self.num_stages: + virtual_pp_stage = ( + origin_micro_step - self.num_stages + ) // first_chunk_acc + else: + virtual_pp_stage = (micro_step - self.num_stages) % ( + self.num_stages * self.num_model_chunks + ) + virtual_pp_stage = virtual_pp_stage // self.num_stages if not forward: virtual_pp_stage = self.num_model_chunks - virtual_pp_stage - 1 + return virtual_pp_stage def _forward_step_helper(self, micro_dataset, micro_step): @@ -1337,7 +1403,7 @@ def _backward_step_helper(self, micro_step): assert hasattr(self, 'output_tensor_grads') assert ( - len(self.output_tensor_grads[virtual_pp_rank]) == 1 + len(self.output_tensor_grads[virtual_pp_rank]) > 0 ), f"output_tensor_grads is empty for virtual_pp_rank {virtual_pp_rank}" assert len(self.input_tensors[virtual_pp_rank]) > 0 @@ -1420,6 +1486,7 @@ def forward_backward_pipeline( fwd_buffer_queue = queue.Queue() bwd_buffer_queue = queue.Queue() skip_steps = self.accumulate_steps % self.num_stages + last_stage_recv_queue = deque() left_id = skip_steps right_id = left_id + first_chunk_acc * (self.num_model_chunks - 1) @@ -1437,17 +1504,48 @@ def _process_fwd_buffer(step_id, tensor): tensor = None return tensor + def _last_stage_need_recv_next(micro_step): + if micro_step >= first_chunk_acc: + if len(last_stage_recv_queue) == 0: + return False + else: + res = last_stage_recv_queue[0] + if micro_step - res[0] < self.num_stages: + return False + else: + return True + else: + return False + + def _last_stage_recv_pp_rank(micro_step): + if micro_step >= first_chunk_acc: + assert ( + len(last_stage_recv_queue) != 0 + ), "last_stage_recv_queue can't be empty" + virtual_pp_stage = (last_stage_recv_queue.popleft())[1] + return virtual_pp_stage - 1 + else: + return self.num_model_chunks - 1 + def _process_bwd_buffer(step_id, tensor): - if step_id < first_chunk_steps: + if self._best_unbalanced_scheduler: if not self.is_pipeline_first_stage(): bwd_buffer_queue.put(tensor) - if left_id <= step_id < right_id: + if step_id >= left_id and not bwd_buffer_queue.empty(): tensor = bwd_buffer_queue.get() else: tensor = None else: - if self.is_pipeline_first_stage(): - tensor = None + if step_id < first_chunk_steps: + if not self.is_pipeline_first_stage(): + bwd_buffer_queue.put(tensor) + if left_id <= step_id < right_id: + tensor = bwd_buffer_queue.get() + else: + tensor = None + else: + if self.is_pipeline_first_stage(): + tensor = None return tensor per_stage_accumulate_steps = self.accumulate_steps // self.num_stages @@ -1716,6 +1814,18 @@ def _process_bwd_buffer(step_id, tensor): backward_micro_step_id ) + if ( + self._best_unbalanced_scheduler + and self.is_pipeline_last_stage(ignore_virtual=True) + ): + cur_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id, forward=False + ) + if cur_pp_rank != 0: + last_stage_recv_queue.append( + (backward_micro_step_id, cur_pp_rank) + ) + 
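+ # Each queue entry pairs a backward micro step with its virtual chunk id: + # _last_stage_need_recv_next fires once num_stages micro steps have passed + # since the recorded step, and _last_stage_recv_pp_rank then pops the entry + # to pick the chunk one below it for the incoming output tensor grad.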
# first stage doesn't send grad to upstream backward_virtual_pp_rank = self._get_virtual_pp_rank( backward_micro_step_id, forward=False @@ -1728,19 +1838,34 @@ def _process_bwd_buffer(step_id, tensor): recv_next = True if self.is_pipeline_last_stage(ignore_virtual=True): + if self._best_unbalanced_scheduler: + next_backward_virtual_pp_rank = ( + self._get_virtual_pp_rank( + backward_micro_step_id + 1, + forward=False, + ) + ) + if self.is_pipeline_last_stage(ignore_virtual=True): + recv_next = _last_stage_need_recv_next( + backward_micro_step_id + 1 + ) + else: + next_backward_virtual_pp_rank = ( + self._get_virtual_pp_rank( + backward_micro_step_id + 1, + forward=False, + ) + ) + if next_backward_virtual_pp_rank == ( + self.num_model_chunks - 1 + ): + # next chunk is the last chunk, not need to pre recv an output tensor grad + recv_next = False + else: next_backward_virtual_pp_rank = self._get_virtual_pp_rank( backward_micro_step_id + 1, forward=False, ) - if next_backward_virtual_pp_rank == ( - self.num_model_chunks - 1 - ): - # next chunk is the last chunk, not need to pre recv an output tensor grad - recv_next = False - else: - next_backward_virtual_pp_rank = self._get_virtual_pp_rank( - backward_micro_step_id + 1, forward=False - ) ( output_tensor_grad, @@ -1773,6 +1898,17 @@ def _process_bwd_buffer(step_id, tensor): self._record_stamp( "B", backward_micro_step_id, '"E"', forward=False ) + if ( + self._best_unbalanced_scheduler + and self.is_pipeline_last_stage(ignore_virtual=True) + ): + cur_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id, forward=False + ) + if cur_pp_rank != 0: + last_stage_recv_queue.append( + (backward_micro_step_id, cur_pp_rank) + ) # four directions comm # send output tensor to downstream @@ -1817,14 +1953,25 @@ def _process_bwd_buffer(step_id, tensor): # determine whether to recv grad from downstream recv_next = True - next_backward_virtual_pp_rank = self._get_virtual_pp_rank( - backward_micro_step_id + 1, forward=False - ) - if self.is_pipeline_last_stage(ignore_virtual=True) and ( - next_backward_virtual_pp_rank == (self.num_model_chunks - 1) - ): - # last pp stage and last virtual stage - recv_next = False + if self._best_unbalanced_scheduler: + next_backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id + 1, + forward=False, + ) + if self.is_pipeline_last_stage(ignore_virtual=True): + recv_next = _last_stage_need_recv_next( + backward_micro_step_id + 1 + ) + else: + next_backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id + 1, forward=False + ) + if self.is_pipeline_last_stage(ignore_virtual=True) and ( + next_backward_virtual_pp_rank + == (self.num_model_chunks - 1) + ): + # last pp stage and last virtual stage + recv_next = False ( input_tensor, @@ -1841,10 +1988,42 @@ def _process_bwd_buffer(step_id, tensor): self.input_tensors[next_forward_virtual_pp_rank].append( input_tensor ) + # append output_tensor_grad no matter none or not - self.output_tensor_grads[next_backward_virtual_pp_rank].append( - output_tensor_grad - ) + if self._best_unbalanced_scheduler: + if self.is_pipeline_last_stage(ignore_virtual=True): + if recv_next: + recv_next_virtual_pp_rank = _last_stage_recv_pp_rank( + backward_micro_step_id + 1 + ) + self.output_tensor_grads[ + recv_next_virtual_pp_rank + ].append(output_tensor_grad) + if ( + next_backward_virtual_pp_rank + == self.num_model_chunks - 1 + and recv_next_virtual_pp_rank + != next_backward_virtual_pp_rank + ): + self.output_tensor_grads[ + self.num_model_chunks 
- 1 + ].append(None) + elif ( + next_backward_virtual_pp_rank + == self.num_model_chunks - 1 + ): + self.output_tensor_grads[ + self.num_model_chunks - 1 + ].append(None) + else: + self.output_tensor_grads[ + next_backward_virtual_pp_rank + ].append(output_tensor_grad) + else: + self.output_tensor_grads[next_backward_virtual_pp_rank].append( + output_tensor_grad + ) + self._release_output(output_tensor) assert fwd_buffer_queue.empty(), "forward buffer should be empty" @@ -1893,15 +2072,28 @@ def _process_bwd_buffer(step_id, tensor): input_tensor_grad = self._backward_step_helper(micro_step) self._record_stamp("B", micro_step, '"E"', forward=False) next_backward_virtual_pp_rank = self._get_virtual_pp_rank( - micro_step + 1, forward=False + micro_step + 1, + forward=False, ) + if ( + self._best_unbalanced_scheduler + and self.is_pipeline_last_stage(ignore_virtual=True) + ): + cur_pp_rank = self._get_virtual_pp_rank( + micro_step, forward=False + ) + if cur_pp_rank != 0: + last_stage_recv_queue.append((micro_step, cur_pp_rank)) recv_next = True if self.is_pipeline_last_stage(ignore_virtual=True): - if next_backward_virtual_pp_rank == ( - self.num_model_chunks - 1 - ): - recv_next = False + if self._best_unbalanced_scheduler: + recv_next = _last_stage_need_recv_next(micro_step + 1) + else: + if next_backward_virtual_pp_rank == ( + self.num_model_chunks - 1 + ): + recv_next = False if micro_step == (num_steps - 1): recv_next = False @@ -1912,13 +2104,46 @@ def _process_bwd_buffer(step_id, tensor): ) # append output_tensor_grad no matter none or not - self.output_tensor_grads[next_backward_virtual_pp_rank].append( - self._p2p_helper.send_backward_recv_backward( - input_tensor_grad, - recv_next=recv_next, - batch_p2p_comm=self._use_batch_p2p_comm, + if self._best_unbalanced_scheduler: + if self.is_pipeline_last_stage(ignore_virtual=True): + output_tensor_grad = ( + self._p2p_helper.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + batch_p2p_comm=self._use_batch_p2p_comm, + ) + ) + if recv_next: + recv_next_virtual_pp_rank = ( + _last_stage_recv_pp_rank(micro_step + 1) + ) + self.output_tensor_grads[ + recv_next_virtual_pp_rank + ].append(output_tensor_grad) + else: + self.output_tensor_grads[ + next_backward_virtual_pp_rank + ].append(output_tensor_grad) + else: + self.output_tensor_grads[ + next_backward_virtual_pp_rank + ].append( + self._p2p_helper.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + batch_p2p_comm=self._use_batch_p2p_comm, + ) + ) + else: + self.output_tensor_grads[ + next_backward_virtual_pp_rank + ].append( + self._p2p_helper.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + batch_p2p_comm=self._use_batch_p2p_comm, + ) ) - ) self._sync_overlap_grads() From b4baafae36825d8c2bb7b4acdc0f423c86058604 Mon Sep 17 00:00:00 2001 From: doggy-tao <3160391266@qq.com> Date: Mon, 14 Oct 2024 10:18:46 +0800 Subject: [PATCH 101/135] add backward decomp for logsumexp and trunc (#68485) * add backward decomp for logsumexp and trunc * modified logsumexp_grad * modified some files * add prim_op_type and public_python_api for TestLogsumexpBF16Op() * support dynamic shape for logsumexp_grad * support dynamic shape for trunc_grad & modify TestLogsumexp_FP16 --- paddle/fluid/primitive/codegen/gen.py | 2 + paddle/fluid/primitive/rule/vjp/details.h | 100 ++++++++++++++++++ python/paddle/autograd/backward_utils.py | 4 +- test/legacy_test/test_logsumexp.py | 56 ++++++---- test/legacy_test/test_trunc_op.py | 15 ++- 
..._sub_graph_klmno_backward_dynamic_shape.py | 81 ++++++++++++++ ..._sub_graph_pqrst_backward_dynamic_shape.py | 17 +++ 7 files changed, 253 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index f0e9e57c4697b..bdd3c4f2ce1f9 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -76,6 +76,7 @@ 'sin_grad', 'cos_grad', 'tanh_grad', + 'trunc_grad', 'square_grad', ] @@ -126,6 +127,7 @@ 'unsqueeze_grad', 'where_grad', 'logcumsumexp_grad', + 'logsumexp_grad', ] # whole vjp list of primitive op vjp diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index bedf08ff6e1c0..f39536f599b23 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -2884,6 +2884,106 @@ void logcumsumexp_grad(const Tensor& x, } } +template +void logsumexp_grad(const Tensor& x, + const Tensor& out, + const Tensor& out_grad, + const IntArray& axis, + bool keepdim, + bool reduce_all, + Tensor* x_grad) { + if (x_grad) { + int64_t axis_size = axis.size(); + int64_t x_dim_size = x.dims().size(); + reduce_all = false; + + if (reduce_all || axis_size == 0 || axis_size == x_dim_size) { + reduce_all = true; + } else { + reduce_all = false; + } + + auto x_grad_tmp = Tensor(); + + if (has_dynamic_shape(x.shape())) { + Tensor x_shape = shape(x); + if (x_dim_size == 1) { + x_grad_tmp = backend::expand(out_grad, x_shape) * exp(x - out); + } else { + if (!keepdim) { + auto axis_ = std::vector(); + if (reduce_all) { + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + } + } else { + axis_ = axis.GetData(); + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_[i] = axis[i] + x_dim_size; + } + } + } + + auto result_shape = get_unsqueeze_dims(shape(out_grad), axis_); + auto out_ = backend::reshape(out, result_shape); + auto softmax = exp(x - backend::expand(out_, x_shape)); + + auto out_grad_ = backend::reshape(out_grad, result_shape); + x_grad_tmp = backend::expand(out_grad_, x_shape) * softmax; + } else { + x_grad_tmp = backend::expand(out_grad, x_shape) * exp(x - out); + } + } + } else { + std::vector x_dim = common::vectorize(x.dims()); + if (x_dim_size == 1) { + x_grad_tmp = expand(out_grad, IntArray(x_dim)) * exp(x - out); + } else { + if (!keepdim) { + auto axis_ = std::vector(); + if (reduce_all) { + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + } + } else { + axis_ = axis.GetData(); + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_[i] = axis[i] + x_dim_size; + } + } + } + auto out_shape = get_unsqueeze_dims(out, axis_); + auto out_ = reshape(out, out_shape); + auto softmax = exp(x - expand(out_, IntArray(x_dim))); + + auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_); + auto out_grad_ = reshape(out_grad, out_grad_shape); + x_grad_tmp = expand(out_grad_, IntArray(x_dim)) * softmax; + } else { + x_grad_tmp = expand(out_grad, IntArray(x_dim)) * exp(x - out); + } + } + } + set_output(x_grad_tmp, x_grad); + } +} + +template +void trunc_grad(const Tensor& out_grad, Tensor* x_grad) { + Tensor zero; + if (x_grad) { + if (has_dynamic_shape(out_grad.shape())) { + zero = backend::full_with_tensor( + shape(out_grad), 0.0, out_grad.dtype()); + } else { + zero = full(out_grad.shape(), 0.0, out_grad.dtype()); + } + set_output(zero, x_grad); + } +} + } // namespace details } // namespace primitive } // namespace paddle diff --git 
a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index ad470b614d689..5f5865a141d89 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -55,6 +55,7 @@ "pd_op.leaky_relu", "pd_op.log", "pd_op.logcumsumexp", + "pd_op.logsumexp", "pd_op.matmul", "pd_op.max", "pd_op.maximum", @@ -88,8 +89,9 @@ "pd_op.swish", "pd_op.tanh", "pd_op.topk", - "pd_op.unsqueeze", "pd_op.transpose", + "pd_op.trunc", + "pd_op.unsqueeze", "pd_op.where", ] diff --git a/test/legacy_test/test_logsumexp.py b/test/legacy_test/test_logsumexp.py index c36408d8bfbda..714c086a6d8ba 100644 --- a/test/legacy_test/test_logsumexp.py +++ b/test/legacy_test/test_logsumexp.py @@ -57,7 +57,9 @@ def logsumexp_ref_grad(x): class TestLogsumexp(OpTest): def setUp(self): self.op_type = 'logsumexp' + self.prim_op_type = "prim" self.python_api = logsumexp_wrapper + self.public_python_api = logsumexp_wrapper self.shape = [2, 3, 4, 5] self.dtype = 'float64' self.axis = [-1] @@ -87,7 +89,10 @@ def set_attrs_addition(self): pass def test_check_output(self): - self.check_output(check_pir=True) + self.check_output( + check_pir=True, + check_prim_pir=True, + ) def test_check_grad(self): self.check_grad( @@ -96,6 +101,7 @@ def test_check_grad(self): user_defined_grads=self.user_defined_grads, user_defined_grad_outputs=self.user_defined_grad_outputs, check_pir=True, + check_prim_pir=True, ) def calc_grad(self): @@ -165,24 +171,25 @@ def set_attrs(self): self.dtype = 'float16' def test_check_output(self): - ref_x = self.inputs['X'].astype(np.float32) - out_ref = ref_logsumexp(ref_x) - paddle.disable_static() - x = self.inputs['X'].astype(np.float16) - tensor_x = paddle.to_tensor(x) - out_pad = logsumexp_wrapper(tensor_x) - paddle.enable_static() - np.testing.assert_allclose( - out_pad.numpy(), out_ref, rtol=1e-03, atol=1e-08 + place = core.CUDAPlace(0) + self.check_output_with_place( + place, + check_pir=True, + check_prim_pir=True, ) def test_check_grad(self): - self.__class__.dtype = self.dtype - ref_x = self.inputs['X'].astype(np.float32) - ref_x_grad = logsumexp_ref_grad(ref_x) - x = self.inputs['X'].astype(np.float16) - x_grad = logsumexp_op_grad(x) - np.testing.assert_allclose(x_grad, ref_x_grad, rtol=1e-03, atol=1e-05) + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + check_pir=True, + check_prim_pir=True, + ) + + def set_attrs_addition(self): + pass @unittest.skipIf( @@ -193,7 +200,9 @@ def test_check_grad(self): class TestLogsumexpBF16Op(TestLogsumexp): def setUp(self): self.op_type = 'logsumexp' + self.prim_op_type = "prim" self.python_api = logsumexp_wrapper + self.public_python_api = logsumexp_wrapper self.dtype = np.uint16 self.shape = [2, 3, 4, 5] self.axis = [-1] @@ -213,11 +222,21 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_pir=True) + self.check_output_with_place( + place, + check_pir=True, + check_prim_pir=True, + ) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', check_pir=True) + self.check_grad_with_place( + place, + ['X'], + 'Out', + check_pir=True, + check_prim_pir=True, + ) def set_attrs(self): pass @@ -227,7 +246,6 @@ def set_attrs_addition(self): class TestLogsumexpError(unittest.TestCase): - def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): self.assertRaises(TypeError, paddle.logsumexp, 1) diff --git a/test/legacy_test/test_trunc_op.py 
b/test/legacy_test/test_trunc_op.py index 4ce4534d67a61..ce452d125784c 100644 --- a/test/legacy_test/test_trunc_op.py +++ b/test/legacy_test/test_trunc_op.py @@ -26,7 +26,9 @@ class TestTruncOp(OpTest): def setUp(self): self.op_type = "trunc" + self.prim_op_type = "prim" self.python_api = paddle.trunc + self.public_python_api = paddle.trunc self.init_dtype_type() np.random.seed(2021) self.inputs = {'X': np.random.random((20, 20)).astype(self.dtype)} @@ -36,10 +38,19 @@ def init_dtype_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_pir=True) + self.check_output( + check_pir=True, + check_prim_pir=True, + ) def test_check_grad(self): - self.check_grad(['X'], 'Out', numeric_grad_delta=1e-5, check_pir=True) + self.check_grad( + ['X'], + 'Out', + numeric_grad_delta=1e-5, + check_pir=True, + check_prim_pir=True, + ) class TestFloatTruncOp(TestTruncOp): diff --git a/test/prim/pir_prim/test_prim_sub_graph_klmno_backward_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_klmno_backward_dynamic_shape.py index a7e0feb633985..ed231b45fd95a 100644 --- a/test/prim/pir_prim/test_prim_sub_graph_klmno_backward_dynamic_shape.py +++ b/test/prim/pir_prim/test_prim_sub_graph_klmno_backward_dynamic_shape.py @@ -39,6 +39,22 @@ def logcumsumexp_net3(x): return paddle.logcumsumexp(x, axis=-1) +def logsumexp_net1(x): + return paddle.logsumexp(x) + + +def logsumexp_net2(x): + return paddle.logsumexp(x, keepdim=False) + + +def logsumexp_net3(x): + return paddle.logsumexp(x, axis=-1, keepdim=False) + + +def logsumexp_net4(x): + return paddle.logsumexp(x, axis=[0, 2], keepdim=False) + + def matmul_net(x, y): return paddle.matmul(x, y) @@ -143,6 +159,71 @@ def setUp(self): self.tol = 1e-6 +class TestPrimLogsumexpWithGrad1(TestPrimBaseWithGrad): + def setUp(self): + np.random.seed(2024) + self.op_name = "pd_op.logsumexp_grad" + self.dtype = "float32" + self.x_shape = [1000] + self.init_x_shape = [None] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.net = logsumexp_net1 + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimLogsumexpWithGrad2(TestPrimBaseWithGrad): + def setUp(self): + np.random.seed(2024) + self.op_name = "pd_op.logsumexp_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.net = logsumexp_net2 + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimLogsumexpWithGrad3(TestPrimBaseWithGrad): + def setUp(self): + np.random.seed(2024) + self.op_name = "pd_op.logsumexp_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.net = logsumexp_net1 + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimLogsumexpWithGrad4(TestPrimBaseWithGrad): + def setUp(self): + np.random.seed(2024) + self.op_name = "pd_op.logsumexp_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.net = logsumexp_net3 + self.enable_cinn = False + self.tol = 1e-6 + + +class TestPrimLogsumexpWithGrad5(TestPrimBaseWithGrad): + def setUp(self): + np.random.seed(2024) + self.op_name = "pd_op.logsumexp_grad" + self.dtype = "float32" + self.x_shape = [30, 200, 40] + self.init_x_shape = [None, None, 40] + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.net = logsumexp_net4 + self.enable_cinn = 
False
+        self.tol = 1e-6
+
+
 class TestPrimMatmulWithGrad1(TestPrimTwoWithGrad):
     def setUp(self):
         np.random.seed(2023)

diff --git a/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py
index fb56382c99b6f..47163744caeb3 100644
--- a/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py
+++ b/test/prim/pir_prim/test_prim_sub_graph_pqrst_backward_dynamic_shape.py
@@ -178,6 +178,10 @@ def transpose_net(x):
     return paddle.transpose(x, perm=[0, 3, 1, 2])
 
 
+def trunc_net(x):
+    return paddle.trunc(x)
+
+
 class TestPrimPadWithGrad(TestPrimBaseWithGrad):
     def setUp(self):
         np.random.seed(2023)
@@ -1091,5 +1095,18 @@ def setUp(self):
         self.tol = 1e-6
 
 
+class TestPrimTruncWithGrad(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2024)
+        self.op_name = "pd_op.trunc_grad"
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, None]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = trunc_net
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
 if __name__ == "__main__":
     unittest.main()

From a239362c1524cdecbc3074452caaf0ecb47788a6 Mon Sep 17 00:00:00 2001
From: RAM <141618702+gongshaotian@users.noreply.github.com>
Date: Mon, 14 Oct 2024 10:29:36 +0800
Subject: [PATCH 102/135] [CINN] Add IdentityOpCleanPass to the Pass execution
 process of CINN (#68641)

* fix bug and refine code
* add !
* execute identity_op_clean_pass in cinn
* wrapper code
* add note

---
 .../dialect/operator/transforms/add_cinn_pass.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
index 2a1908c3d55db..9973901aad3e7 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
@@ -56,6 +56,7 @@
 #include "paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.h"
 #include "paddle/fluid/pir/transforms/build_cinn_pass.h"
 #include "paddle/fluid/pir/transforms/general/dead_code_elimination_pass.h"
+#include "paddle/fluid/pir/transforms/general/identity_op_clean_pass.h"
 #include "paddle/fluid/pir/transforms/gpu/fused_gemm_epilogue_pass.h"
 
 COMMON_DECLARE_bool(cinn_specify_input_dynamic_dim);
@@ -94,6 +95,19 @@ bool HasDynamicShape(const pir::Program& program) {
   }
 }
 }  // namespace
 
+void ApplyIdentityOpCleanPass(
+    ::pir::Program* program,
+    const std::function<std::shared_ptr<::pir::PassManager>()>&
+        CreatePassManager) {
+  // NOTE(gongshaotian): Before Paddle 3.0, useless full and scale ops could
+  // be inserted at the end of the Program when exporting models with Paddle.
+  // This pass cleans up those leftover ops when CINN execution is enabled
+  // for models that cannot be re-exported.
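+  // For illustration only (PIR pseudo-IR rather than exact textual IR), the
+  // no-op patterns such a cleanup targets look like:
+  //   %0 = "pd_op.full"() {...}                        // dead, never consumed
+  //   %1 = "pd_op.scale"(%x) {scale: 1.0, bias: 0.0}   // identity scale
+  // uses of %1 are redirected to %x and the dead ops are erased.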
+ std::shared_ptr pass_manager = CreatePassManager(); + pass_manager->AddPass(pir::CreateIdentityOpCleanPass()); + pass_manager->Run(program); +} + void ApplyShapeOptimizationPass( ::pir::Program* program, const std::function()>& @@ -264,6 +278,7 @@ void ApplyCinnPass(::pir::Program* program, .file_name("original_programs.py") .dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims) .SaveIfFlagEnabled(); + ApplyIdentityOpCleanPass(program, CreatePassManager); ApplyShapeOptimizationPass(program, CreatePassManager); ApplyPdToCinnPass(program, CreatePassManager); ApplyCinnPreprocessPass(program, CreatePassManager); From 38c81acf89c450d3fa1b3b784a12c9e79fcc655a Mon Sep 17 00:00:00 2001 From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:40:06 +0800 Subject: [PATCH 103/135] fix conv2d_grad infer spmd bugs (#68589) * fix conv2d_grad infer spmd bugs * turn on conv2d backward spmd rule test --- paddle/phi/infermeta/spmd_rules/conv2d.cc | 8 ++++++++ test/auto_parallel/semi_auto_parallel_for_conv2d.py | 3 +-- test/cpp/auto_parallel/spmd_rule_test.cc | 6 ++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/paddle/phi/infermeta/spmd_rules/conv2d.cc b/paddle/phi/infermeta/spmd_rules/conv2d.cc index 5771ee4ea94cb..b6a1bbc355efa 100644 --- a/paddle/phi/infermeta/spmd_rules/conv2d.cc +++ b/paddle/phi/infermeta/spmd_rules/conv2d.cc @@ -255,6 +255,10 @@ SpmdInfo Conv2dGradInferSpmdBase(const DistMetaTensor& input, GetReplicatedDistAttr(input_dist_attr_src); input_grad_dist_attr_dst.set_dims_mapping( GetDimsMappingForAxes(input_axes, axis_to_dim_map_1)); + // handle partial for input_grad + std::vector partial_on_m_dim = + ResoluteOutputPartialDimension(axis_to_dim_map_1, input_axes); + input_grad_dist_attr_dst.set_partial_status(partial_on_m_dim); TensorDistAttr filter_dist_attr_dst = CopyTensorDistAttrForOutput(filter_dist_attr_src); filter_dist_attr_dst.set_dims_mapping( @@ -267,6 +271,10 @@ SpmdInfo Conv2dGradInferSpmdBase(const DistMetaTensor& input, GetReplicatedDistAttr(filter_dist_attr_src); filter_grad_dist_attr_dst.set_dims_mapping( GetDimsMappingForAxes(filter_axes, axis_to_dim_map_2)); + // handle partial for filter_grad + std::vector partial_on_n_dim = + ResoluteOutputPartialDimension(axis_to_dim_map_2, filter_axes); + filter_grad_dist_attr_dst.set_partial_status(partial_on_n_dim); TensorDistAttr input_dist_attr_dst = CopyTensorDistAttrForOutput(input_dist_attr_src); input_dist_attr_dst.set_dims_mapping( diff --git a/test/auto_parallel/semi_auto_parallel_for_conv2d.py b/test/auto_parallel/semi_auto_parallel_for_conv2d.py index 0bcf933010c48..586255e33a65f 100644 --- a/test/auto_parallel/semi_auto_parallel_for_conv2d.py +++ b/test/auto_parallel/semi_auto_parallel_for_conv2d.py @@ -35,8 +35,7 @@ def test_conv2d_shard(self): inputs_shape=shapes, inputs_specs=specs, op_func=paddle.nn.functional.conv2d, - # Todo(jeff41404): the spmd rule of conv2d_grad is fixing, after that, we can set with_backward to True. 
- with_backward=False, + with_backward=True, ) self.check_placements(outputs, [dist.Shard(0)]) diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index a22e72ff88e95..0b4dc99681ec0 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -2187,6 +2187,7 @@ TEST(Conv2dGradSPMDRule, Ctor) { check_dim_mapping(infered_dist_attrs.second[0], {0, -1, -1, -1}); check_dim_mapping(infered_dist_attrs.second[1], {-1, -1, -1, -1}); EXPECT_EQ(is_partial(infered_dist_attrs.first[2]), false); + EXPECT_EQ(is_partial(infered_dist_attrs.second[1]), true); // test 3 input_dist_attr.set_dims_mapping(std::vector({-1, -1, -1, -1})); @@ -2208,6 +2209,7 @@ TEST(Conv2dGradSPMDRule, Ctor) { check_dim_mapping(infered_dist_attrs.second[0], {-1, -1, -1, -1}); check_dim_mapping(infered_dist_attrs.second[1], {0, -1, -1, -1}); EXPECT_EQ(is_partial(infered_dist_attrs.first[2]), false); + EXPECT_EQ(is_partial(infered_dist_attrs.second[0]), true); // test 4 input_dist_attr.set_dims_mapping(std::vector({0, -1, -1, -1})); @@ -2229,6 +2231,8 @@ TEST(Conv2dGradSPMDRule, Ctor) { check_dim_mapping(infered_dist_attrs.second[0], {0, -1, -1, -1}); check_dim_mapping(infered_dist_attrs.second[1], {1, -1, -1, -1}); EXPECT_EQ(is_partial(infered_dist_attrs.first[2]), false); + EXPECT_EQ(is_partial(infered_dist_attrs.second[0]), true); + EXPECT_EQ(is_partial(infered_dist_attrs.second[1]), true); // test 5 input_dist_attr.set_dims_mapping(std::vector({0, 2, -1, -1})); @@ -2252,6 +2256,8 @@ TEST(Conv2dGradSPMDRule, Ctor) { check_dim_mapping(infered_dist_attrs.second[0], {0, 2, -1, -1}); check_dim_mapping(infered_dist_attrs.second[1], {1, 2, -1, -1}); EXPECT_EQ(is_partial(infered_dist_attrs.first[2]), true); + EXPECT_EQ(is_partial(infered_dist_attrs.second[0]), true); + EXPECT_EQ(is_partial(infered_dist_attrs.second[1]), true); } } // namespace auto_parallel From 68453e873f7621d65902937587abf331fb8c497d Mon Sep 17 00:00:00 2001 From: Chen Zhiyang <1792266893@qq.com> Date: Mon, 14 Oct 2024 10:44:28 +0800 Subject: [PATCH 104/135] add cf saveload and flag (#68628) --- paddle/common/flags.cc | 4 ++ .../include/deserialize_utils.h | 29 +++++++++++ .../include/serialize_utils.h | 28 ++++++++++ .../serialize_deserialize/src/ir_serialize.cc | 44 +++++++++------- .../serialize_deserialize/src/patch_util.cc | 52 ++++++++++++------- .../include/core/storage_manager_support.h | 3 +- .../include/dialect/control_flow/ir/cf_type.h | 3 ++ 7 files changed, 124 insertions(+), 39 deletions(-) diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc index fd0cc0024d510..d755e31870e46 100644 --- a/paddle/common/flags.cc +++ b/paddle/common/flags.cc @@ -1965,3 +1965,7 @@ PHI_DEFINE_EXPORTED_bool(fused_multi_transformer_op_use_mbfmha, PHI_DEFINE_EXPORTED_int64(multi_block_attention_min_partition_size, 1024, "The minimum partition size for flash decoding"); + +PHI_DEFINE_EXPORTED_bool(save_cf_stack_op, + false, + "Save cf stack op for higher-order derivatives."); diff --git a/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h index 3305167842810..51d5a0199d318 100644 --- a/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h +++ b/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h @@ -29,6 +29,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_type.h" +#include 
"paddle/pir/include/dialect/control_flow/ir/cf_type.h" #include "paddle/utils/flat_hash_map.h" namespace pir { @@ -66,6 +67,10 @@ class AttrTypeReader { static pir::Attribute ReadPaddleDistAttr(const std::string attr_name, Json* attr_json, pir::IrContext* ctx); + + static pir::Type ReadControlFlowType(const std::string type_name, + Json* type_json, + pir::IrContext* ctx); }; template @@ -237,6 +242,9 @@ pir::Type parseType(Json* type_json) { } else if (DECOMPRESS_DIALECT_ID(name.first) == paddle::dialect::DistDialect::name()) { return AttrTypeReader::ReadPaddleDistType(name.second, type_json, ctx); + } else if (DECOMPRESS_DIALECT_ID(name.first) == + pir::ControlFlowDialect::name()) { + return AttrTypeReader::ReadControlFlowType(name.second, type_json, ctx); } else { PADDLE_ENFORCE( false, @@ -695,4 +703,25 @@ pir::Type AttrTypeReader::ReadPaddleDistType(const std::string type_name, } } +pir::Type AttrTypeReader::ReadControlFlowType(const std::string type_name, + Json* type_json, + pir::IrContext* ctx) { + if (type_name == pir::StackType::name()) { + VLOG(8) << "Parse StackType ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::InletType::name()) { + VLOG(8) << "Parse InletType ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else if (type_name == pir::OutletType::name()) { + VLOG(8) << "Parse OutletType ... "; + return pir::deserializeTypeFromJson(type_json, ctx); + } else { + PADDLE_ENFORCE( + false, + common::errors::InvalidArgument( + "Unknown Type %s for parse controlflow dialect type", type_name)); + return pir::Type(); + } +} + } // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h index 782c808bb7607..5b39f72138671 100644 --- a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h +++ b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h @@ -28,6 +28,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/dialect/control_flow/ir/cf_type.h" namespace pir { #define COMPRESS_DIALECT_NAME(attr_template) \ @@ -53,6 +54,8 @@ class AttrTypeWriter { static Json WritePaddleDistType(const pir::Type& type); static Json WritePaddleDistAttr(const pir::Attribute& attr); + + static Json WriteControlFlowType(const pir::Type& type); }; /** serializeTypeToJson is a template function to serialize * a pir type to a json object. a pir type may have value or no value @@ -245,6 +248,9 @@ Json writeType(const pir::Type& type) { } else if (type.dialect().name() == paddle::dialect::DistDialect::name()) { VLOG(6) << "write PaddleDistType ... "; return AttrTypeWriter::WritePaddleDistType(type); + } else if (type.dialect().name() == pir::ControlFlowDialect::name()) { + VLOG(6) << "write ControlFlowDialect ... "; + return AttrTypeWriter::WriteControlFlowType(type); } else { PADDLE_ENFORCE( false, @@ -723,4 +729,26 @@ Json AttrTypeWriter::WritePaddleDistAttr(const pir::Attribute& attr) { return Json::object(); } +Json AttrTypeWriter::WriteControlFlowType(const pir::Type& type) { + Json type_json = Json::object(); + if (type.isa()) { + VLOG(8) << "Write StackType ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write InletType ... "; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else if (type.isa()) { + VLOG(8) << "Write OutletType ... 
"; + return pir::serializeTypeToJson( + type.dyn_cast()); + } else { + PADDLE_ENFORCE(false, + common::errors::InvalidArgument( + "Unknown Type when write controlflow dialect type")); + } + return type_json; +} + } // namespace pir diff --git a/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc b/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc index 7bcf38559b5f3..dc41e7a7d58b4 100644 --- a/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc +++ b/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h" +#include "paddle/common/flags.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h" #include "paddle/pir/include/core/dialect.h" @@ -20,6 +21,7 @@ #include "paddle/pir/include/dialect/control_flow/ir/cf_dialect.h" #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" +COMMON_DECLARE_bool(save_cf_stack_op); namespace pir { Json ProgramWriter::GetProgramJson(const pir::Program* program) { @@ -99,29 +101,31 @@ Json ProgramWriter::WriteBlock(pir::Block* block, Json ops_json = Json::array(); /* delete cf.stack_create / cf.tuple_push */ - std::vector delete_ops; - for (auto op : block->ops()) { - if (op->isa()) { - delete_ops.push_back(op); - } - } - VLOG(6) << "program before delete stack op :" << *(block->parent_program()); - for (auto op : delete_ops) { - VLOG(0) << "Delete cf.stack_create / cf.tuple_push."; - auto stack_op = op->dyn_cast(); - if (stack_op.inlet().HasOneUse()) { - auto tuple_push_op = stack_op.tuple_push_op(); - auto block_in = tuple_push_op->GetParent(); - block_in->erase(*tuple_push_op); + if (!FLAGS_save_cf_stack_op) { + std::vector delete_ops; + for (auto op : block->ops()) { + if (op->isa()) { + delete_ops.push_back(op); + } } - if (stack_op.outlet().HasOneUse()) { - auto tuple_pop_op = stack_op.tuple_pop_op(); - auto block_in = tuple_pop_op->GetParent(); - block_in->erase(*tuple_pop_op); + VLOG(6) << "program before delete stack op :" << *(block->parent_program()); + for (auto op : delete_ops) { + VLOG(0) << "Delete cf.stack_create / cf.tuple_push."; + auto stack_op = op->dyn_cast(); + if (stack_op.inlet().HasOneUse()) { + auto tuple_push_op = stack_op.tuple_push_op(); + auto block_in = tuple_push_op->GetParent(); + block_in->erase(*tuple_push_op); + } + if (stack_op.outlet().HasOneUse()) { + auto tuple_pop_op = stack_op.tuple_pop_op(); + auto block_in = tuple_pop_op->GetParent(); + block_in->erase(*tuple_pop_op); + } + block->erase(*op); } - block->erase(*op); + VLOG(6) << "program after delete stack op :" << *(block->parent_program()); } - VLOG(6) << "program after delete stack op :" << *(block->parent_program()); for (auto op : block->ops()) { auto op_json = WriteOp(*op); ops_json.emplace_back(op_json); diff --git a/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc b/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc index 91d5f37db2426..c3b68f16acad5 100644 --- a/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc +++ b/paddle/fluid/pir/serialize_deserialize/src/patch_util.cc @@ -24,6 +24,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/dialect/control_flow/ir/cf_type.h" namespace pir { @@ -168,9 +169,15 @@ std::string GetTypeName(const YAML::Node &action) { Json 
GetTypeJson(const YAML::Node &action) { Json json; - std::string dialect = DialectIdMap::Instance()->GetCompressDialectId( - pir::BuiltinDialect::name()) + - "."; + std::string builtin_dialect = DialectIdMap::Instance()->GetCompressDialectId( + pir::BuiltinDialect::name()) + + "."; + std::string op_dialect = DialectIdMap::Instance()->GetCompressDialectId( + paddle::dialect::OperatorDialect::name()) + + "."; + std::string cf_dialect = DialectIdMap::Instance()->GetCompressDialectId( + pir::ControlFlowDialect::name()) + + "."; std::string type_name = ""; if (action.IsScalar()) { type_name = action.as(); @@ -181,46 +188,46 @@ Json GetTypeJson(const YAML::Node &action) { } if (type_name == "pir::BoolType") { VLOG(8) << "Get BoolType name."; - json[ID] = dialect + pir::BoolType::name(); + json[ID] = builtin_dialect + pir::BoolType::name(); } else if (type_name == "pir::BFloat16Type") { VLOG(8) << "Get BFloat16Type name."; - json[ID] = dialect + pir::BFloat16Type::name(); + json[ID] = builtin_dialect + pir::BFloat16Type::name(); } else if (type_name == "pir::Float16Type") { VLOG(8) << "Get Float16Type name."; - json[ID] = dialect + pir::Float16Type::name(); + json[ID] = builtin_dialect + pir::Float16Type::name(); } else if (type_name == "pir::Float32Type") { VLOG(8) << "Get Float32Type name."; - json[ID] = dialect + pir::Float32Type::name(); + json[ID] = builtin_dialect + pir::Float32Type::name(); } else if (type_name == "pir::Float64Type") { VLOG(8) << "Get Float64Type name."; - json[ID] = dialect + pir::Float64Type::name(); + json[ID] = builtin_dialect + pir::Float64Type::name(); } else if (type_name == "pir::Int8Type") { VLOG(8) << "Get Int8Type name."; - json[ID] = dialect + pir::Int8Type::name(); + json[ID] = builtin_dialect + pir::Int8Type::name(); } else if (type_name == "pir::UInt8Type") { VLOG(8) << "Get UInt8Type name."; - json[ID] = dialect + pir::UInt8Type::name(); + json[ID] = builtin_dialect + pir::UInt8Type::name(); } else if (type_name == "pir::Int16Type") { VLOG(8) << "Get Int16Type name."; - json[ID] = dialect + pir::Int16Type::name(); + json[ID] = builtin_dialect + pir::Int16Type::name(); } else if (type_name == "pir::Int32Type") { VLOG(8) << "Get Int32Type name."; - json[ID] = dialect + pir::Int32Type::name(); + json[ID] = builtin_dialect + pir::Int32Type::name(); } else if (type_name == "pir::Int64Type") { VLOG(8) << "Get Int64Type name."; - json[ID] = dialect + pir::Int64Type::name(); + json[ID] = builtin_dialect + pir::Int64Type::name(); } else if (type_name == "pir::IndexType") { VLOG(8) << "Get IndexType name."; - json[ID] = dialect + pir::IndexType::name(); + json[ID] = builtin_dialect + pir::IndexType::name(); } else if (type_name == "pir::Complex64Type") { VLOG(8) << "Get Complex64Type name."; - json[ID] = dialect + pir::Complex64Type::name(); + json[ID] = builtin_dialect + pir::Complex64Type::name(); } else if (type_name == "pir::Complex128Type") { VLOG(8) << "Get Complex128Type name."; - json[ID] = dialect + pir::Complex128Type::name(); + json[ID] = builtin_dialect + pir::Complex128Type::name(); } else if (type_name == "pir::VectorType") { VLOG(8) << "Get VectorType name."; - json[ID] = dialect + pir::VectorType::name(); + json[ID] = builtin_dialect + pir::VectorType::name(); json[DATA] = Json::array(); for (size_t i = 0; i < action["default"].size(); i++) { YAML::Node array_value = action["default"][i]; @@ -228,7 +235,7 @@ Json GetTypeJson(const YAML::Node &action) { } } else if (type_name == "pir::DenseTensorType") { VLOG(8) << "Get DenseTensorType name."; - 
json[ID] = dialect + pir::DenseTensorType::name(); + json[ID] = builtin_dialect + pir::DenseTensorType::name(); Json content = Json::array(); YAML::Node tensor_value = action["default"]; content.push_back(BuildTypeJsonPatch(tensor_value[0])); @@ -242,6 +249,15 @@ Json GetTypeJson(const YAML::Node &action) { content.push_back(tensor_value[4].as()); // offset json[DATA] = content; + } else if (type_name == "pir::StackType") { + VLOG(8) << "Get StackType name."; + json[ID] = cf_dialect + pir::StackType::name(); + } else if (type_name == "pir::InletType") { + VLOG(8) << "Get InletType name."; + json[ID] = cf_dialect + pir::InletType::name(); + } else if (type_name == "pir::OutletType") { + VLOG(8) << "Get OutletType name."; + json[ID] = cf_dialect + pir::OutletType::name(); } return json; } diff --git a/paddle/pir/include/core/storage_manager_support.h b/paddle/pir/include/core/storage_manager_support.h index 614f3938c54e2..8964ab0d1a5f0 100644 --- a/paddle/pir/include/core/storage_manager_support.h +++ b/paddle/pir/include/core/storage_manager_support.h @@ -65,7 +65,8 @@ class StorageHelperBase : public BaseT { using InterfaceList = typename Filter>::Type; - static ConcreteT dyn_cast_impl(BaseT type) { + template + static ConcreteT dyn_cast_impl(T type) { if (type && type.type_id() == TypeId::get()) { return ConcreteT(type.storage()); } diff --git a/paddle/pir/include/dialect/control_flow/ir/cf_type.h b/paddle/pir/include/dialect/control_flow/ir/cf_type.h index 1e76de313d861..cf3e50c4790f2 100644 --- a/paddle/pir/include/dialect/control_flow/ir/cf_type.h +++ b/paddle/pir/include/dialect/control_flow/ir/cf_type.h @@ -30,16 +30,19 @@ class IR_API StackType : public Type::TypeBase { public: using Base::Base; + static std::string name() { return "t_stack"; } }; class IR_API InletType : public Type::TypeBase { public: using Base::Base; + static std::string name() { return "t_inlet"; } }; class IR_API OutletType : public Type::TypeBase { public: using Base::Base; + static std::string name() { return "t_outlet"; } }; } // namespace pir From 361f487bf57b4889c9cf9b0f9e057943e0a571c7 Mon Sep 17 00:00:00 2001 From: waliwali777 Date: Mon, 14 Oct 2024 10:46:51 +0800 Subject: [PATCH 105/135] [ Auto Parallel ]replace softmax_with_cross_entropy with c_softmax_with_cross_entropypass (#68182) --- .../pir/dialect/op_generator/ops_api_gen.py | 2 +- .../c_softmax_with_cross_entropy.cc | 166 +++++++++++++++++ .../spmd_rules/c_softmax_with_cross_entropy.h | 41 +++++ paddle/phi/infermeta/spmd_rules/rules.cc | 4 + paddle/phi/infermeta/spmd_rules/rules.h | 1 + .../yaml/inconsistent/static_backward.yaml | 1 + .../phi/ops/yaml/inconsistent/static_ops.yaml | 1 + .../distributed/auto_parallel/constants.py | 3 + .../auto_parallel/static/engine.py | 9 +- python/paddle/distributed/passes/__init__.py | 3 + ...lel_replace_with_parallel_cross_entropy.py | 95 ++++++++++ test/auto_parallel/pir/CMakeLists.txt | 7 + ...th_parallel_cross_entropy_pass_unittest.py | 174 ++++++++++++++++++ ...eplace_with_parallel_cross_entropy_pass.py | 52 ++++++ .../test_c_softmax_with_cross_entropy_rule.py | 146 +++++++++++++++ 15 files changed, 703 insertions(+), 2 deletions(-) create mode 100644 paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.cc create mode 100644 paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h create mode 100644 python/paddle/distributed/passes/auto_parallel_replace_with_parallel_cross_entropy.py create mode 100644 
test/auto_parallel/pir/mp_auto_parallel_replace_with_parallel_cross_entropy_pass_unittest.py create mode 100644 test/auto_parallel/pir/test_auto_parallel_replace_with_parallel_cross_entropy_pass.py create mode 100644 test/auto_parallel/spmd_rules/test_c_softmax_with_cross_entropy_rule.py diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 3ee47850f4dcb..3e33a77002957 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -98,6 +98,7 @@ 'c_reducescatter', 'c_allreduce_min_', 'c_allreduce_prod_', + 'c_softmax_with_cross_entropy', 'distributed_fused_lamb_init', 'distributed_fused_lamb_init_', 'fetch', @@ -166,7 +167,6 @@ 'c_allreduce_prod', 'c_identity', 'c_reduce_sum', - 'c_softmax_with_cross_entropy', 'c_split', 'comm_init_all', 'decayed_adagrad', diff --git a/paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.cc b/paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.cc new file mode 100644 index 0000000000000..29740b390b8d9 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h" + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" + +namespace phi::distributed { +using phi::distributed::auto_parallel::str_join; + +void GetCrossEntropyNotations(int x_ndim, + std::string* x_axes_src, + std::string* x_axes_dst, + std::string* label_axes_src, + std::string* label_axes_dst, + std::string* loss_axes, + std::string* softmax_axes_dst) { + std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; + *x_axes_src = GetBroadcastAxes(x_ndim, x_ndim, alphabet); + *x_axes_dst = *x_axes_src; + *label_axes_src = *x_axes_src; + (*label_axes_src)[x_ndim - 1] = '1'; + *label_axes_dst = *label_axes_src; + *loss_axes = *label_axes_src; + *softmax_axes_dst = *x_axes_dst; +} + +SpmdInfo CSoftmaxWithCrossEntropyInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& label, + int ignore_index, + int ring_id, + int rank, + int nranks) { + // Step0: Verify input args based on c_softmax_with_cross_entropy logic + EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(x); + EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(label); + + VLOG(4) << "CSoftmaxWithCrossEntropySPMDRule InferForward Inputs: " + << "X shape: [" << str_join(x_shape) << "], x_dims_mapping_src: [" + << str_join(x_dims_mapping_src) << "]; Label shape: [" + << str_join(label_shape) << "], Label dims mapping: [" + << str_join(label_dims_mapping_src) << "]; ignore_index: [" + << ignore_index << "]"; + + // Step1: Build Einsum Notation + std::string x_axes_src, x_axes_dst, label_axes_src, label_axes_dst, loss_axes, + softmax_axes_dst; + GetCrossEntropyNotations(x_ndim, + &x_axes_src, + &x_axes_dst, + &label_axes_src, + &label_axes_dst, + &loss_axes, + &softmax_axes_dst); + + // Step2: Sharding Propagation + // Step2.1: merge input shardings + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors({{x_axes_src, x_dims_mapping_src}, + {label_axes_src, label_dims_mapping_src}}); + + // Step2.2: infer output dims mappings + TensorDistAttr loss_dist_attr_dst = + CopyTensorDistAttrForOutput(label_dist_attr_src); + loss_dist_attr_dst.set_dims_mapping( + GetDimsMappingForAxes(loss_axes, axis_to_dim_map)); + TensorDistAttr softmax_dist_attr_dst = + CopyTensorDistAttrForOutput(x_dist_attr_src); + softmax_dist_attr_dst.set_dims_mapping( + GetDimsMappingForAxes(softmax_axes_dst, axis_to_dim_map)); + + // Step2.3: update input dims mappings with merged one + TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src); + x_dist_attr_dst.set_dims_mapping( + GetDimsMappingForAxes(x_axes_dst, axis_to_dim_map)); + TensorDistAttr label_dist_attr_dst = + CopyTensorDistAttrForOutput(label_dist_attr_src); + label_dist_attr_dst.set_dims_mapping( + GetDimsMappingForAxes(label_axes_dst, axis_to_dim_map)); + + VLOG(4) << "CSoftmaxWithCrossEntropyInferSpmd:"; + VLOG(4) << "ignore_index: [" << ignore_index << "]."; + + VLOG(4) << "Einsum notation: [" << x_axes_src << "," << label_axes_src + << " --> " << softmax_axes_dst << "," << loss_axes << "].\n"; + + LOG_SPMD_INPUT(x); + LOG_SPMD_INPUT(label); + LOG_SPMD_OUTPUT(softmax_dist_attr_dst); + LOG_SPMD_OUTPUT(loss_dist_attr_dst); + + return SpmdInfo({x_dist_attr_dst, label_dist_attr_dst}, + {softmax_dist_attr_dst, loss_dist_attr_dst}); 
+} + +SpmdInfo CSoftmaxWithCrossEntropyGradSpmd(const DistMetaTensor& softmax, + const DistMetaTensor& label, + const DistMetaTensor& loss_grad, + int ignore_index, + int ring_id, + int rank, + int nranks) { + EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(softmax); + EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(label); + EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(loss_grad); + + std::string label_axes_src, label_axes_dst, softmax_axes_src, + softmax_axes_dst, loss_grad_axes; + std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; + auto x_axes_src = alphabet.substr(0, loss_grad_ndim); + auto x_axes_dst = x_axes_src; + label_axes_src = x_axes_src; + label_axes_src[loss_grad_ndim - 1] = '1'; + label_axes_dst = label_axes_src; + loss_grad_axes = label_axes_src; + softmax_axes_src = x_axes_dst; + softmax_axes_dst = x_axes_dst; + + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors({{label_axes_src, label_dims_mapping_src}, + {softmax_axes_src, softmax_dims_mapping_src}, + {loss_grad_axes, loss_grad_dims_mapping_src}}); + + auto label_dist_attr_dst = CopyTensorDistAttrForOutput(label_dist_attr_src); + auto label_dims_mapping_dst = + GetDimsMappingForAxes(label_axes_dst, axis_to_dim_map, true); + label_dist_attr_dst.set_dims_mapping(label_dims_mapping_dst); + + auto softmax_dist_attr_dst = + CopyTensorDistAttrForOutput(softmax_dist_attr_src); + auto softmax_dims_mapping_dst = + GetDimsMappingForAxes(softmax_axes_dst, axis_to_dim_map, true); + softmax_dist_attr_dst.set_dims_mapping(softmax_dims_mapping_dst); + + auto loss_grad_dist_attr_dst = + CopyTensorDistAttrForOutput(loss_grad_dist_attr_src); + auto loss_grad_dims_mapping_dst = + GetDimsMappingForAxes(loss_grad_axes, axis_to_dim_map, true); + loss_grad_dist_attr_dst.set_dims_mapping(loss_grad_dims_mapping_dst); + + auto x_grad = CopyTensorDistAttrForOutput(softmax_dist_attr_dst); + x_grad.set_dims_mapping(softmax_dims_mapping_dst); + + LOG_SPMD_INPUT(softmax); + LOG_SPMD_INPUT(label); + LOG_SPMD_INPUT(loss_grad); + LOG_SPMD_OUTPUT(x_grad); + + return {{softmax_dist_attr_dst, label_dist_attr_dst, loss_grad_dist_attr_dst}, + {x_grad}}; +} +} // namespace phi::distributed diff --git a/paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h b/paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h new file mode 100644 index 0000000000000..57cc4dc0bcb8e --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { + +SpmdInfo CSoftmaxWithCrossEntropyInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& label, + int ignore_index, + int ring_id, + int rank, + int nranks); + +SpmdInfo CSoftmaxWithCrossEntropyGradSpmd(const DistMetaTensor& softmax, + const DistMetaTensor& label, + const DistMetaTensor& loss_grad, + int ignore_index, + int ring_id, + int rank, + int nranks); +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.cc b/paddle/phi/infermeta/spmd_rules/rules.cc index b100620ee0ba7..5496f894257e6 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.cc +++ b/paddle/phi/infermeta/spmd_rules/rules.cc @@ -643,6 +643,10 @@ PD_REGISTER_SPMD_RULE( PD_INFER_SPMD(phi::distributed::CrossEntropyWithSoftmaxInferSpmdStatic), PD_INFER_SPMD(phi::distributed::CrossEntropyWithSoftmaxInferSpmdReverse)); +PD_REGISTER_SPMD_RULE( + c_softmax_with_cross_entropy, + PD_INFER_SPMD(phi::distributed::CSoftmaxWithCrossEntropyInferSpmd)); + // fused_linear_param_grad_add got no reverse infer spmd rule PD_REGISTER_SPMD_RULE( fused_linear_param_grad_add, diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index f99cb45014560..7d8eb2d76fc39 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/amp_ops.h" #include "paddle/phi/infermeta/spmd_rules/argmax.h" #include "paddle/phi/infermeta/spmd_rules/c_embedding.h" +#include "paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h" #include "paddle/phi/infermeta/spmd_rules/cast.h" #include "paddle/phi/infermeta/spmd_rules/concat.h" #include "paddle/phi/infermeta/spmd_rules/conv2d.h" diff --git a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml index 8e5533ead94f9..d35e42599707b 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml @@ -113,6 +113,7 @@ output: Tensor(logits_grad) infer_meta : func: CSoftmaxWithCrossEntropyGradInferMeta + spmd_rule : CSoftmaxWithCrossEntropyGradSpmd kernel: func: c_softmax_with_cross_entropy_grad data_type: loss_grad diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index fd52f7d48418b..259399b78ebce 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -1023,6 +1023,7 @@ output: Tensor(softmax), Tensor(loss) infer_meta: func : CSoftmaxWithCrossEntropyInferMeta + spmd_rule : CSoftmaxWithCrossEntropyInferSpmd kernel: func: c_softmax_with_cross_entropy data_type : logits diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 3df32cfaece3e..2e34c6893d637 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -342,6 +342,9 @@ class _DPOptimizationConfig(TypedDict, total=False): # noqa: PYI049 ) set_field_default_config(MP_OPTIMIZATION, "replace_with_c_embedding", False) +set_field_default_config( + MP_OPTIMIZATION, 
"replace_with_parallel_cross_entropy", False +) if TYPE_CHECKING: class _MPOptimizationConfig(TypedDict, total=False): # noqa: PYI049 diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index dffd5266b052c..e6b6ae20a1d02 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -726,8 +726,15 @@ def _parallel_pir(self, mode): # TODO(JZ-LIANG) regulization pass with pass management. dist_program = mix_fw_program.clone() apply_mix2dist_pass(dist_program) - set_all_ops_op_role(dist_program.global_block(), OpRole.Forward) + if self._strategy.mp_optimization.replace_with_parallel_cross_entropy: + auto_parallel_replace_with_parallel_cross_entropy_pass = new_pass( + "replace_with_parallel_cross_entropy", {} + ) + auto_parallel_replace_with_parallel_cross_entropy_pass.apply( + [dist_program], [startup_program] + ) + set_all_ops_op_role(dist_program.global_block(), OpRole.Forward) if ( self._strategy.pipeline.enable and self._strategy.pipeline.schedule_mode == "VPP" diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 9dfd3d5a8ca90..a85e415ca0d54 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -55,6 +55,9 @@ RecomputePass, RecomputeState, ) +from .auto_parallel_replace_with_parallel_cross_entropy import ( # noqa: F401 + AutoParallelReplaceWithParallelCrossEntropyPass, +) from .auto_parallel_sequence_parallel_optimization import ( # noqa: F401 SequenceParallelOptimizationPass, ) diff --git a/python/paddle/distributed/passes/auto_parallel_replace_with_parallel_cross_entropy.py b/python/paddle/distributed/passes/auto_parallel_replace_with_parallel_cross_entropy.py new file mode 100644 index 0000000000000..45c2a59cae9fd --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_replace_with_parallel_cross_entropy.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import paddle +import paddle.distributed as dist + +from ..auto_parallel.static.utils import ( + get_logger, +) +from .pass_base import PassBase, register_pass + +logger = get_logger(logging.INFO) + + +@register_pass("replace_with_parallel_cross_entropy") +class AutoParallelReplaceWithParallelCrossEntropyPass(PassBase): + def __init__(self): + super().__init__() + hcg = dist.fleet.get_hybrid_communicate_group() + self.model_parallel_group = hcg.get_model_parallel_group() + self.tensor_parallel_degree = hcg.get_model_parallel_world_size() + + def _check_self(self): + # The activation of this pass requires adopting a model parallel strategy. 
+        if self.tensor_parallel_degree < 2:
+            return False
+        return True
+
+    def _check_conflict(self, other_pass):
+        return True
+
+    def _check_user(self, value):
+        placement1 = value.placements
+        for user in value.all_used_ops():
+            for operand in user.operands_source():
+                if operand.get_defining_op() != value.get_defining_op():
+                    continue
+                placement2 = operand.placements
+                if placement1 != placement2:
+                    return False
+                break
+        return True
+
+    def _apply_single_impl(self, main_program, startup_program, context):
+        del_ops = []
+        new_ops = []
+        ring_id = self.model_parallel_group.id
+        rank = self.model_parallel_group.rank
+        nranks = self.model_parallel_group.nranks
+
+        for block in main_program.blocks:
+            for op in reversed(block.ops):
+                if op.name() == 'pd_op.cross_entropy_with_softmax':
+                    operand1 = op.operand_source(0)
+                    operand2 = op.operand_source(1)
+
+                    # The `logit` input of the `cross_entropy_with_softmax`
+                    # operator needs to be split along the column.
+                    placement1 = operand1.placements
+                    if not placement1[1].is_shard():
+                        return
+
+                    ignore_index = op.attrs()["ignore_index"]
+                    paddle.pir.set_insertion_point(op)
+                    softmax, loss = paddle._C_ops.c_softmax_with_cross_entropy(
+                        operand1, operand2, ignore_index, ring_id, rank, nranks
+                    )
+                    op.result(0).replace_all_uses_with(softmax)
+                    op.result(1).replace_all_uses_with(loss)
+                    del_ops.append(op)
+                    new_ops.append(softmax.get_defining_op())
+
+        for op in del_ops:
+            for result in op.results():
+                assert result.use_empty()
+            op.erase()
+        # In the forward program, the placements of the newly added op's
+        # output should be consistent with the placements of the user op's
+        # input.
+        for op in new_ops:
+            for result in op.results():
+                assert self._check_user(result)
+        return
diff --git a/test/auto_parallel/pir/CMakeLists.txt b/test/auto_parallel/pir/CMakeLists.txt
index 37703872c6757..0d23075c98ced 100644
--- a/test/auto_parallel/pir/CMakeLists.txt
+++ b/test/auto_parallel/pir/CMakeLists.txt
@@ -25,6 +25,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
     test_auto_parallel_c_embedding_pass MODULES
     test_auto_parallel_c_embedding_pass ENVS FLAGS_enable_pir_api=1
     FLAGS_dist_prim_all=1)
+  py_test_modules(
+    test_auto_parallel_replace_with_parallel_cross_entropy_pass MODULES
+    test_auto_parallel_replace_with_parallel_cross_entropy_pass ENVS
+    FLAGS_enable_pir_api=1 FLAGS_dist_prim_all=1)
   py_test_modules(test_reshard MODULES test_reshard ENVS FLAGS_enable_pir_api=1)
   py_test_modules(test_learning_rate MODULES test_learning_rate ENVS
                   FLAGS_enable_pir_api=1)
@@ -38,6 +42,9 @@
     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 60)
   set_tests_properties(test_auto_parallel_c_embedding_pass
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 300)
+  set_tests_properties(
+    test_auto_parallel_replace_with_parallel_cross_entropy_pass
+    PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 60)
   py_test_modules(
     test_eliminate_transpose_pass MODULES test_eliminate_transpose_pass ENVS
     FLAGS_enable_pir_in_executor=1)
diff --git a/test/auto_parallel/pir/mp_auto_parallel_replace_with_parallel_cross_entropy_pass_unittest.py b/test/auto_parallel/pir/mp_auto_parallel_replace_with_parallel_cross_entropy_pass_unittest.py
new file mode 100644
index 0000000000000..c0698f2f071be
--- /dev/null
+++ b/test/auto_parallel/pir/mp_auto_parallel_replace_with_parallel_cross_entropy_pass_unittest.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import unittest + +import numpy as np + +import paddle +import paddle.distributed as dist +from paddle import nn +from paddle.distributed import fleet +from paddle.io import DataLoader + +BATCH_SIZE = 4 +IMAGE_SIZE = 8 +ignore_index = -100 + + +class DemoNet(nn.Layer): + def __init__(self, mesh): + super().__init__() + self.linear = nn.Linear( + IMAGE_SIZE, + IMAGE_SIZE, + bias_attr=False, + weight_attr=paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.0) + ), + ) + self.linear.weight = dist.shard_tensor( + self.linear.weight, + mesh, + [dist.Replicate(), dist.Shard(1)], + stop_gradient=False, + ) + self.soft = paddle.nn.CrossEntropyLoss( + reduction="none", ignore_index=ignore_index + ) + + def forward(self, x): + out = self.linear(x) + y = paddle.ones(shape=[BATCH_SIZE, 1], dtype='int64') + out = paddle.cast(out, 'float32') + out = self.soft(out, y) + return out + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, images, labels, num_samples): + self.images = images + self.labels = labels + self.num_samples = num_samples + + def __getitem__(self, idx): + return self.images[idx], self.labels[idx] + + def __len__(self): + return self.num_samples + + +class TestMPReplaceWithParallelCrossEntropy(unittest.TestCase): + def setUp(self): + self.atol = 1e-5 + self.set_random_seed(eval(os.getenv("seed"))) + self.mesh = dist.ProcessMesh([[0, 1]], dim_names=["x", "y"]) + paddle.base.set_flags({'FLAGS_enable_pir_api': 1}) + + self.init_dist() + + def init_dist(self): + self.data_loader = self.create_data_loader() + self.dist_loader = dist.shard_dataloader( + dataloader=self.data_loader, + meshes=[self.mesh], + ) + self.loss_fn = nn.MSELoss() + + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 2, + "pp_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + + def set_random_seed(self, seed): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + def create_data_loader(self): + images = np.random.rand(BATCH_SIZE, IMAGE_SIZE).astype('float32') + labels = np.random.rand(IMAGE_SIZE, 1).astype('float32') + dataset = RandomDataset(images, labels, BATCH_SIZE) + loader = DataLoader(dataset, batch_size=BATCH_SIZE) + return loader + + def run_dy2static(self, dist_model): + loss_list = [] + dist_model._engine._mode = "train" + dist_model.train() + + for epoch in range(10): + for batch_id, data in enumerate(self.dist_loader()): + if isinstance(data, dict): + image = data['image'] + label = data['label'] + else: + image, label = data + loss = dist_model(image, label) + loss_list += [loss] + + return loss_list, dist_model + + def run_mp(self, use_pass): + paddle.disable_static() + + model = DemoNet(self.mesh) + opt = paddle.optimizer.SGD( + learning_rate=0.1, parameters=model.parameters() + ) + strategy = dist.Strategy() + strategy._mp_optimization.replace_with_parallel_cross_entropy = use_pass + dist_model = dist.to_static( + 
model, self.dist_loader, self.loss_fn, opt, strategy + ) + losses, dist_model = self.run_dy2static(dist_model) + return losses, dist_model + + def check_results(self, check_losses, ref_losses): + assert len(ref_losses) == len(check_losses) + for i in range(len(ref_losses)): + np.testing.assert_allclose( + ref_losses[i], check_losses[i], self.atol + ) + + def check_program( + self, prog_with_pass, prog_without_pass, rtol=None, atol=None + ): + ops_with_pass = [op.name() for op in prog_with_pass.global_block().ops] + ops_without_pass = [ + op.name() for op in prog_without_pass.global_block().ops + ] + + self.assertIn('pd_op.c_softmax_with_cross_entropy', ops_with_pass) + self.assertIn('pd_op.cross_entropy_with_softmax', ops_without_pass) + + def test_mp_replace_with_parallel_cross_entropy_pass(self): + losses_with_pass, dist_model_with_pass = self.run_mp(True) + losses_without_pass, dist_model_without_pass = self.run_mp(False) + self.check_results(losses_with_pass, losses_without_pass) + prog_with_pass = dist_model_with_pass.dist_main_program() + prog_without_pass = dist_model_without_pass.dist_main_program() + self.check_program(prog_with_pass, prog_without_pass) + + +if __name__ == "__main__": + # NOTE: due to the incompatibility between Llama and GPT models + # in the PIR model, UnitTest was completed using ‘DemoNet’. + # Model testing can be supplemented later. + unittest.main() diff --git a/test/auto_parallel/pir/test_auto_parallel_replace_with_parallel_cross_entropy_pass.py b/test/auto_parallel/pir/test_auto_parallel_replace_with_parallel_cross_entropy_pass.py new file mode 100644 index 0000000000000..0209652d07155 --- /dev/null +++ b/test/auto_parallel/pir/test_auto_parallel_replace_with_parallel_cross_entropy_pass.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
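+#
+# Usage sketch for the pass exercised here (taken from the unittest above;
+# `model`, `loader`, `loss_fn` and `opt` stand in for user objects):
+#
+#     strategy = dist.Strategy()
+#     strategy._mp_optimization.replace_with_parallel_cross_entropy = True
+#     dist_model = dist.to_static(model, loader, loss_fn, opt, strategy)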
+import tempfile +import unittest + +import collective.test_communication_api_base as test_base + + +class TestAutoParallelReplaceWithParallelCrossEntropyPass( + test_base.CommunicationTestDistBase +): + def setUp(self): + super().setUp( + num_of_devices=2, + timeout=300, + ) + self._default_envs = { + "dtype": "float32", + "seed": "2024", + "FLAGS_embedding_deterministic": "1", + "FLAGS_cudnn_deterministic": "1", + } + self._changeable_envs = {"backend": ["gpu"]} + + def test_mlp(self): + envs_list = test_base.gen_product_envs_list( + {"dtype": "float32", "seed": "2024"}, {"backend": ["gpu"]} + ) + for envs in envs_list: + # self._log_dir.name = "./log" + ckpt_path_tmp = tempfile.TemporaryDirectory() + envs["ckpt_path"] = ckpt_path_tmp.name + self.run_test_case( + "mp_auto_parallel_replace_with_parallel_cross_entropy_pass_unittest.py", + user_defined_envs=envs, + ) + ckpt_path_tmp.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/spmd_rules/test_c_softmax_with_cross_entropy_rule.py b/test/auto_parallel/spmd_rules/test_c_softmax_with_cross_entropy_rule.py new file mode 100644 index 0000000000000..fbe83cad1b5d3 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_c_softmax_with_cross_entropy_rule.py @@ -0,0 +1,146 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from collections import OrderedDict + +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto +from paddle.framework import core + + +class TestCSoftmaxWithCrossEntropySPMDRule(unittest.TestCase): + """ + Unit tests for c_softmax_with_cross_entropy spmd rule. 
+ """ + + def setUp(self): + self.rule1 = core.get_phi_spmd_rule("c_softmax_with_cross_entropy") + process_mesh = auto.ProcessMesh(mesh=[[0, 1], [2, 3]]) + logit_shape = [16, 128, 128] + logit_tensor_dist_attr = TensorDistAttr() + logit_tensor_dist_attr.process_mesh = process_mesh + self.logit_dist_tensor_spec = DistTensorSpec( + logit_shape, logit_tensor_dist_attr + ) + + label_shape = [16, 128, 1] + label_tensor_dist_attr = TensorDistAttr() + label_tensor_dist_attr.process_mesh = process_mesh + self.label_dist_tensor_spec = DistTensorSpec( + label_shape, label_tensor_dist_attr + ) + + softmax_shape = [16, 128, 128] + softmax_tensor_dist_attr = TensorDistAttr() + softmax_tensor_dist_attr.process_mesh = process_mesh + self.softmax_dist_tensor_spec = DistTensorSpec( + softmax_shape, softmax_tensor_dist_attr + ) + + loss_shape = [16, 128, 1] + loss_tensor_dist_attr = TensorDistAttr() + loss_tensor_dist_attr.process_mesh = process_mesh + self.loss_dist_tensor_spec = DistTensorSpec( + loss_shape, loss_tensor_dist_attr + ) + + self.attrs = OrderedDict( + [ + ('ignore_index', -1), + ('ring_id', 0), + ('rank', 0), + ('nranks', 2), + ] + ) + + def test_infer_forward(self): + # llama MP case + # [-1, -1, 1] [-1, -1, -1] (logit, label) --> + # [-1, -1, 1] [-1, -1, -1] (logit, label) + # [-1, -1, 1] [-1, -1, -1] (softmax, loss) + self.logit_dist_tensor_spec.set_dims_mapping([-1, -1, 1]) + self.label_dist_tensor_spec.set_dims_mapping([-1, -1, -1]) + + infered_dist_attr = self.rule1.infer_forward( + self.logit_dist_tensor_spec, + self.label_dist_tensor_spec, + self.attrs['ignore_index'], + self.attrs['ring_id'], + self.attrs['rank'], + self.attrs['nranks'], + ) + + self.assertEqual(len(infered_dist_attr), 2) + infered_input_dist_attr = infered_dist_attr[0] + infered_output_dist_attr = infered_dist_attr[1] + + self.assertEqual(len(infered_input_dist_attr), 2) + self.assertEqual(len(infered_output_dist_attr), 2) + + self.assertEqual( + infered_input_dist_attr[0].dims_mapping, [-1, -1, 1] + ) # logit + self.assertEqual( + infered_input_dist_attr[1].dims_mapping, [-1, -1, -1] + ) # label + self.assertEqual( + infered_output_dist_attr[0].dims_mapping, [-1, -1, 1] + ) # softmax + self.assertEqual( + infered_output_dist_attr[1].dims_mapping, [-1, -1, -1] + ) # loss + + # llama MP-DP case + # [0, -1, 1] [0, -1, -1] (logit, label) --> + # [0, -1, 1] [0, -1, -1] (logit, label) + # [0, -1, 1] [0, -1, -1] (softmax, loss) + self.logit_dist_tensor_spec.set_dims_mapping([0, -1, 1]) + self.label_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + + infered_dist_attr = self.rule1.infer_forward( + self.logit_dist_tensor_spec, + self.label_dist_tensor_spec, + self.attrs['ignore_index'], + self.attrs['ring_id'], + self.attrs['rank'], + self.attrs['nranks'], + ) + + self.assertEqual(len(infered_dist_attr), 2) + infered_input_dist_attr = infered_dist_attr[0] + infered_output_dist_attr = infered_dist_attr[1] + + self.assertEqual(len(infered_input_dist_attr), 2) + self.assertEqual(len(infered_output_dist_attr), 2) + + self.assertEqual( + infered_input_dist_attr[0].dims_mapping, [0, -1, 1] + ) # logit + self.assertEqual( + infered_input_dist_attr[1].dims_mapping, [0, -1, -1] + ) # label + self.assertEqual( + infered_output_dist_attr[0].dims_mapping, [0, -1, 1] + ) # softmax + self.assertEqual( + infered_output_dist_attr[1].dims_mapping, [0, -1, -1] + ) # loss + + +if __name__ == "__main__": + unittest.main() From eb514a6fb4b50487a5fd5c0ac8a358fd8c04c8ef Mon Sep 17 00:00:00 2001 From: wangna11BD 
<79366697+wangna11BD@users.noreply.github.com>
Date: Mon, 14 Oct 2024 11:18:48 +0800
Subject: [PATCH 106/135] fix syncbn with counts (#68604)

---
 paddle/phi/kernels/funcs/sync_batch_norm_utils.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/kernels/funcs/sync_batch_norm_utils.h b/paddle/phi/kernels/funcs/sync_batch_norm_utils.h
index 5daf32e0112b3..a522fc6e44774 100644
--- a/paddle/phi/kernels/funcs/sync_batch_norm_utils.h
+++ b/paddle/phi/kernels/funcs/sync_batch_norm_utils.h
@@ -61,16 +61,16 @@ __global__ void KeLocalStats(
     auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum());
     __syncthreads();
     if (threadIdx.x == 0) {
-      mean_var[k] = out / (N * M);
+      mean_var[k] = out;
     }
     out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum());
     __syncthreads();
     if (threadIdx.x == 0) {
-      mean_var[k + C] = out / (N * M);
+      mean_var[k + C] = out;
     }
   }
   if (blockIdx.x == 0 && threadIdx.x == 0) {
-    mean_var[2 * C] = static_cast<BatchNormParamType<T>>(1.0);
+    mean_var[2 * C] = static_cast<BatchNormParamType<T>>(N * M);
   }
 }
 
From 4f2198584b96baba4e97c8c1525faca46ad917a6 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Mon, 14 Oct 2024 12:17:27 +0800
Subject: [PATCH 107/135] [Paddle TensorRT] add pd_op.split_with_num and
 pd_op.split converter (#68608)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* pd_op.split_with_num
* split
* adapt several converters to add_elementwise_layer
* add a helper function, get_shape_with_dynamic_shape
* fix
* pd_op.split
* add unit tests
* re-push the commit because the required CI checks did not show up
* fix split_with_num
* CI stayed stuck, so trigger it again

---
 .../transforms/tensorrt/trt_op_marker_pass.cc | 108 +++++-----
 python/paddle/tensorrt/converter.py           |  22 +-
 python/paddle/tensorrt/converter_utils.py     |  91 ++++++++
 python/paddle/tensorrt/impls/manipulation.py  | 204 ++++++++++++++++++
 python/paddle/tensorrt/util.py                |  13 ++
 test/tensorrt/tensorrt_test_base.py           |   9 +-
 test/tensorrt/test_converter_manipulation.py  | 116 ++++++++++
 7 files changed, 504 insertions(+), 59 deletions(-)

diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
index e8890d6156deb..1cde0e8630e91 100644
--- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
+++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -89,6 +89,13 @@ class Pool2dOpPattern
         op->attribute<pir::BoolAttribute>(kCanRunTrtAttr).data()) {
       return false;
     }
+    paddle::dialect::FullIntArrayOp full_int_array_op =
+        pir::GetDefiningOpForInput(op, 1)
+            ->dyn_cast<paddle::dialect::FullIntArrayOp>();
+    if (!full_int_array_op) {
+      VLOG(3) << "Cannot find FullIntArrayOp";
+      return false;
+    }
     auto padding_attr = op->attribute<pir::ArrayAttribute>("paddings");
     std::vector<int32_t> paddings;
     for (const auto &attr : padding_attr.AsVector()) {
       paddings.push_back(attr.dyn_cast<pir::Int32Attribute>().data());
     }
@@ -122,28 +129,19 @@
     if (!op->attribute<pir::BoolAttribute>("global_pooling").data()) {
       if (op->HasAttribute("exclusive")) {
         if (op->attribute<pir::BoolAttribute>("exclusive").data()) {
-          paddle::dialect::FullIntArrayOp full_int_array_op =
-              pir::GetDefiningOpForInput(op, 1)
-                  ->dyn_cast<paddle::dialect::FullIntArrayOp>();
-          if (!full_int_array_op) {
-            VLOG(3) << "Cannot find FullIntArrayOp";
-            return false;
-          } else {
-            auto attr_value =
-                full_int_array_op->attribute<pir::ArrayAttribute>(
-                    "value");
-            std::vector<int64_t> kernel_size;
-            for (const auto &attr : attr_value.AsVector()) {
-              kernel_size.push_back(
-                  attr.dyn_cast<pir::Int64Attribute>().data());
-            }
-            for (size_t i = 0; i < kernel_size.size(); ++i) {
-              if (kernel_size[i] <= paddings[i]) {
-                VLOG(3) << "the padding size should be less than the "
-                           "filter size "
-                           "for exclusive-counting pooling.";
-                return false;
-              }
+          auto attr_value =
full_int_array_op->attribute("value"); + std::vector kernel_size; + for (const auto &attr : attr_value.AsVector()) { + kernel_size.push_back( + attr.dyn_cast().data()); + } + for (size_t i = 0; i < kernel_size.size(); ++i) { + if (kernel_size[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the " + "filter size " + "for exclusive-counting pooling."; + return false; } } } @@ -796,12 +794,16 @@ class SplitOpPattern : public pir::OpRewritePattern { return false; } - paddle::dialect::FullOp full_op = - pir::GetDefiningOpForInput(op, 2)->dyn_cast(); - if (!full_op) { - VLOG(3) << "Can not find full op"; + pir::Value axis_tensor = op.operand_source(2); + if (!axis_tensor) { + VLOG(3) << "pd_op.split can not find axis input"; return false; - } else { + } + auto out_vector_type = op.result(0).type().dyn_cast(); + if (pir::GetDefiningOpForInput(op, 2)->isa()) { + paddle::dialect::FullOp full_op = + pir::GetDefiningOpForInput(op, 2) + ->dyn_cast(); auto axis = full_op->attribute("value") .data() .to(); @@ -809,29 +811,25 @@ class SplitOpPattern : public pir::OpRewritePattern { .type() .dyn_cast() .dims(); - auto out_vector_type = op.result(0).type().dyn_cast(); - paddle::dialect::FullIntArrayOp full_sections_op = - pir::GetDefiningOpForInput(op, 1) - ->dyn_cast(); - if (!full_sections_op) { - VLOG(3) << "Can not find FullIntArrayOp"; + axis += (axis < 0) ? x_shape.size() : 0; + + if (x_shape[axis] == -1) { + VLOG(3) << "The (" << axis << ") dim of input should not be -1"; return false; } + } + if (pir::GetDefiningOpForInput(op, 1) + ->isa()) { + paddle::dialect::FullIntArrayOp full_sections_op = + pir::GetDefiningOpForInput(op, 1) + ->dyn_cast(); auto sections = full_sections_op->attribute("value"); - std::vector output_lengths; for (const auto &attr : sections.AsVector()) { output_lengths.push_back(attr.dyn_cast().data()); } - axis += (axis < 0) ? x_shape.size() : 0; - - if (x_shape[axis] == -1) { - VLOG(3) << "The (" << axis << ") dim of input should not be -1"; - return false; - } - if (output_lengths.size() != out_vector_type.size()) { VLOG(3) << "The output_length should be equal to the output size."; return false; @@ -853,33 +851,38 @@ class SplitWithNumOpPattern op->attribute(kCanRunTrtAttr).data()) { return false; } - paddle::dialect::FullOp full_op = - pir::GetDefiningOpForInput(op, 1)->dyn_cast(); - if (!full_op) { - VLOG(3) << "Can not find full op"; + + pir::Value axis_tensor = op.operand_source(1); + if (!axis_tensor) { + VLOG(3) << "pd_op.split_with_num can not find axis input"; return false; - } else { - auto axis = full_op->attribute("value") + } + if (pir::GetDefiningOpForInput(op, 1) + ->isa()) { + paddle::dialect::FullIntArrayOp full_int_array_op = + pir::GetDefiningOpForInput(op, 1) + ->dyn_cast(); + auto axis = full_int_array_op + ->attribute("value") .data() .to(); auto x_shape = op.operand_source(0) .type() .dyn_cast() .dims(); - auto out_vector_type = op.result(0).type().dyn_cast(); axis += (axis < 0) ? 
x_shape.size() : 0; if (x_shape[axis] == -1) { VLOG(3) << "The (" << axis << ") dim of input should not be -1"; return false; } - if (!op->HasAttribute("num")) { VLOG(3) << "split_with_num op must has num attributes"; return false; } int num = op->attribute("num").data(); std::vector output_lengths; + if (num > 0) { int64_t in_axis_dim = x_shape[axis]; if (in_axis_dim % num != 0) { @@ -893,14 +896,15 @@ class SplitWithNumOpPattern output_lengths.push_back(out_axis_dim); } } - + auto out_vector_type = op.result(0).type().dyn_cast(); if (out_vector_type.size() != output_lengths.size()) { VLOG(3) << "The output_length should be equal to the output size."; return false; } - op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true)); - return true; } + + op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true)); + return true; } }; class GreaterEqualOpPattern diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 7446f32184870..19f3aebb07116 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -173,6 +173,9 @@ def convert_subgraph_to_trt(self, program, group_op): value_to_trt_tensor[value.id] = input_tensor for op in operations: + # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph. + if op.name() == "builtin.split": + continue operands = [] for operand in op.operands(): source = operand.source() @@ -205,7 +208,18 @@ def convert_subgraph_to_trt(self, program, group_op): trt_outs = self.convert(network, op, operands) + results = [] + for idx, result in enumerate(op.results()): + if result.is_combine(): + used_ops = result.all_used_ops() + for use_op in used_ops: + if use_op.name() == "builtin.split": + split_outputs = use_op.results() + results.extend(split_outputs) + else: + results.append(result) + for idx, result in enumerate(results): if idx < len(trt_outs): value_to_trt_tensor[result.id] = trt_outs[idx] else: @@ -409,14 +423,10 @@ def convert(self, network, paddle_op, inputs): f"Converter for {op_name} not implemented." 
) outs = converter_func(network, paddle_op, inputs) - if isinstance(outs, tuple): - return outs - elif isinstance(outs, trt.ITensor): + if isinstance(outs, trt.ITensor): return (outs,) else: - raise TypeError( - f"Expected outputs to be a tuple or ITensor, but got {type(outs)}" - ) + return outs def convert_program_to_trt(self): for op in self.program.global_block().ops: diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py index 0e482697a34b7..337d876b3df2a 100644 --- a/python/paddle/tensorrt/converter_utils.py +++ b/python/paddle/tensorrt/converter_utils.py @@ -213,6 +213,11 @@ def get_shape_tensor_element(network, x, index): return gather_layer.get_output(0) +def trt_less(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.LESS) + return layer.get_output(0) + + def trt_sum(network, a, b): layer = network.add_elementwise(a, b, trt.ElementWiseOperation.SUM) return layer.get_output(0) @@ -231,3 +236,89 @@ def trt_sub(network, a, b): def trt_min(network, a, b): layer = network.add_elementwise(a, b, trt.ElementWiseOperation.MIN) return layer.get_output(0) + + +def trt_mul(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.PROD) + return layer.get_output(0) + + +def trt_div(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.DIV) + return layer.get_output(0) + + +def trt_floor_div(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.FLOOR_DIV) + return layer.get_output(0) + + +def trt_equal(network, a, b): + layer = network.add_elementwise(a, b, trt.ElementWiseOperation.EQUAL) + return layer.get_output(0) + + +def cast_tensor(network, input_tensor, dtype): + layer = network.add_identity(input_tensor) + layer.set_output_type(0, dtype) + return layer.get_output(0) + + +def build_start_tensor(network, rank, axis_tensor, offset): + # Create indices_tensor [0, 1, ..., rank-1] + indices = np.arange(rank, dtype=np.int32) + indices_tensor = network.add_constant([rank], indices).get_output(0) + + # Create mask: mask = (indices == axis_tensor) + mask = network.add_elementwise( + indices_tensor, axis_tensor, trt.ElementWiseOperation.EQUAL + ).get_output(0) + mask_int = cast_tensor(network, mask, trt.int32) + + # Calculate start_tensor = mask_int * offset + start_tensor = network.add_elementwise( + mask_int, offset, trt.ElementWiseOperation.PROD + ).get_output(0) + + return start_tensor + + +def build_size_tensor( + network, rank, axis_tensor, size_value, input_shape_tensor +): + # Create indices_tensor [0, 1, ..., rank-1] + indices = np.arange(rank, dtype=np.int32) + indices_tensor = network.add_constant([rank], indices).get_output(0) + + # Create mask: mask = (indices == axis_tensor) + mask = network.add_elementwise( + indices_tensor, axis_tensor, trt.ElementWiseOperation.EQUAL + ).get_output(0) + mask_int = cast_tensor(network, mask, trt.int32) + + # Create ones_tensor + ones_tensor = network.add_constant( + [rank], np.ones([rank], dtype=np.int32) + ).get_output(0) + + # Calculate inverse_mask = ones_tensor - mask_int + inverse_mask = network.add_elementwise( + ones_tensor, mask_int, trt.ElementWiseOperation.SUB + ).get_output(0) + + # Calculate size_tensor = mask_int * size_value + inverse_mask * input_shape_tensor + size_value_broadcast = network.add_elementwise( + mask_int, size_value, trt.ElementWiseOperation.PROD + ).get_output(0) + + input_shape_broadcast = network.add_elementwise( + inverse_mask, input_shape_tensor, 
trt.ElementWiseOperation.PROD
+    ).get_output(0)
+
+    size_tensor = network.add_elementwise(
+        size_value_broadcast,
+        input_shape_broadcast,
+        trt.ElementWiseOperation.SUM,
+    ).get_output(0)
+
+    return size_tensor
diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py
index f8b1282ecb23f..203c1816c23ae 100644
--- a/python/paddle/tensorrt/impls/manipulation.py
+++ b/python/paddle/tensorrt/impls/manipulation.py
@@ -18,13 +18,19 @@
 from paddle.tensorrt.converter_utils import (
     add_1D_constant_layer,
+    build_size_tensor,
+    build_start_tensor,
+    cast_tensor,
     get_axes_for_reduce_op,
     get_positive_dim,
     get_shape_tensor_element,
     has_dynamic_shape,
     trt_concat,
+    trt_floor_div,
+    trt_less,
     trt_max,
     trt_min,
+    trt_mul,
     trt_reshape,
     trt_shape,
     trt_sub,
@@ -447,3 +453,201 @@ def slice_converter(network, paddle_op, inputs):
         output_tensor = reshape_layer.get_output(0)
 
     return output_tensor
+
+
+@converter_registry.register("pd_op.split_with_num", trt_version="8.x")
+def split_with_num_converter(network, paddle_op, inputs):
+    input_tensor = inputs[0]
+    input_shape_size = len(input_tensor.shape)
+
+    # Handle the case where axis is of type pir::Value
+    axis_op = paddle_op.operands()[1].source().get_defining_op()
+    if axis_op.name() == "pd_op.full":
+        axis_value = axis_op.attrs()["value"]
+        axis_tensor = add_1D_constant_layer(network, axis_value)
+    else:
+        axis_tensor = inputs[1]
+        axis_tensor = cast_tensor(network, axis_tensor, trt.int32)
+
+    num_splits = paddle_op.attrs().get("num")
+    num_splits_tensor = add_1D_constant_layer(network, num_splits)
+
+    # Get the dynamic shape of the input tensor
+    input_shape_tensor = network.add_shape(input_tensor).get_output(0)
+
+    # Handle negative axis index
+    input_shape_size_tensor = add_1D_constant_layer(network, input_shape_size)
+    zero_tensor = add_1D_constant_layer(network, 0)
+
+    is_negative_axis = trt_less(network, axis_tensor, zero_tensor)
+    is_negative_axis_int = cast_tensor(network, is_negative_axis, trt.int32)
+
+    axis_adjustment = trt_mul(
+        network, is_negative_axis_int, input_shape_size_tensor
+    )
+
+    axis_tensor = trt_sum(network, axis_tensor, axis_adjustment)
+
+    # Get the size of the dimension specified by axis
+    input_axis_size = network.add_gather(
+        input_shape_tensor, axis_tensor, axis=0
+    ).get_output(0)
+
+    # Compute the size of each split
+    split_size = trt_floor_div(network, input_axis_size, num_splits_tensor)
+
+    outputs = []
+    for idx in range(num_splits):
+        idx_tensor = add_1D_constant_layer(network, idx)
+        # Slice idx starts at idx * split_size along the split axis
+        offset = trt_mul(network, idx_tensor, split_size)
+
+        start_tensor = build_start_tensor(
+            network, input_shape_size, axis_tensor, offset
+        )
+
+        # Each slice keeps split_size elements along the split axis
+        size_tensor = build_size_tensor(
+            network,
+            input_shape_size,
+            axis_tensor,
+            split_size,
+            input_shape_tensor,
+        )
+
+        # Create Slice layer
+        slice_layer = network.add_slice(
+            input_tensor,
+            [0] * input_shape_size,
+            [0] * input_shape_size,
+            [1] * input_shape_size,
+        )
+        slice_layer.set_input(1, start_tensor)
+        slice_layer.set_input(2, size_tensor)
+
+        outputs.append(slice_layer.get_output(0))
+
+    return outputs
+
+
+@converter_registry.register("pd_op.split", trt_version="8.x")
+def split_converter(network, paddle_op, inputs):
+    input_tensor = inputs[0]
+    input_shape = paddle_op.operands()[0].source().shape
+    input_shape_size = len(input_shape)
+
+    axis_op = paddle_op.operands()[2].source().get_defining_op()
+    if axis_op.name() == "pd_op.full":
+        axis_value = axis_op.attrs()["value"]
+        axis_tensor = add_1D_constant_layer(network,
axis_value) + else: + axis_tensor = inputs[2] + axis_tensor = cast_tensor(network, axis_tensor, trt.int32) + + # Retrieve and process sections + sections_op = paddle_op.operands()[1].source().get_defining_op() + if sections_op.name() == "pd_op.full_int_array": + sections_value = sections_op.attrs()["value"] + section_list = [int(s) for s in sections_value] + dynamic_sections = False + else: + sections_tensor = inputs[1] + dynamic_sections = True + + # Get the dynamic shape of the input tensor + input_shape_tensor = network.add_shape(input_tensor).get_output(0) + + # Handle negative axis index + input_shape_size_tensor = add_1D_constant_layer(network, input_shape_size) + zero_tensor = add_1D_constant_layer(network, 0) + + is_negative_axis = trt_less(network, axis_tensor, zero_tensor) + is_negative_axis_int = cast_tensor(network, is_negative_axis, trt.int32) + + axis_adjustment = trt_mul( + network, is_negative_axis_int, input_shape_size_tensor + ) + axis_tensor = trt_sum(network, axis_tensor, axis_adjustment) + + # Initialize output list + outputs = [] + offset = add_1D_constant_layer(network, 0) + + if not dynamic_sections: + for section_size in section_list: + section_size_tensor = add_1D_constant_layer(network, section_size) + + # Build start_tensor + start_tensor = build_start_tensor( + network, input_shape_size, axis_tensor, offset + ) + + # Build size_tensor + size_tensor = build_size_tensor( + network, + input_shape_size, + axis_tensor, + section_size_tensor, + input_shape_tensor, + ) + # Create Slice layer + slice_layer = network.add_slice( + input_tensor, + [0] * input_shape_size, + [0] * input_shape_size, + [1] * input_shape_size, + ) + slice_layer.set_input(1, start_tensor) + slice_layer.set_input(2, size_tensor) + + outputs.append(slice_layer.get_output(0)) + + # Update offset + offset = network.add_elementwise( + offset, section_size_tensor, trt.ElementWiseOperation.SUM + ).get_output(0) + else: + # If sections is a dynamic tensor + num_sections = sections_tensor.shape[0] + if num_sections == -1: + raise NotImplementedError("dynamic sections not support") + num_sections = int(num_sections) + + for idx in range(num_sections): + idx_tensor = add_1D_constant_layer(network, idx) + + # Get section_size_tensor = sections_tensor[idx] + section_size_tensor = network.add_gather( + sections_tensor, idx_tensor, axis=0 + ).get_output(0) + + # Build start_tensor + start_tensor = build_start_tensor( + network, input_shape_size, axis_tensor, offset + ) + + # Build size_tensor + size_tensor = build_size_tensor( + network, + input_shape_size, + axis_tensor, + section_size_tensor, + input_shape_tensor, + ) + + # Create Slice layer + slice_layer = network.add_slice( + input_tensor, + [0] * input_shape_size, + [0] * input_shape_size, + [1] * input_shape_size, + ) + slice_layer.set_input(1, start_tensor) + slice_layer.set_input(2, size_tensor) + + outputs.append(slice_layer.get_output(0)) + + # Update offset + offset = network.add_elementwise( + offset, section_size_tensor, trt.ElementWiseOperation.SUM + ).get_output(0) + + return outputs diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py index 8ed41d83e15ff..8f50744fb9adc 100644 --- a/python/paddle/tensorrt/util.py +++ b/python/paddle/tensorrt/util.py @@ -110,3 +110,16 @@ def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): ) paddle.framework.set_flags({"FLAGS_enable_collect_shape": False}) return exe_program + + +# Adding marker labels to builtin ops facilitates convert processing, but they 
ultimately do not enter the TensorRT subgraph. +def mark_buitlin_op(program): + for op in program.global_block().ops: + if op.name() == "builtin.split": + defining_op = op.operands()[0].source().get_defining_op() + if defining_op is not None: + if ( + defining_op.has_attr("__l_trt__") + and defining_op.attrs()["__l_trt__"] + ): + enforce_op_lower_trt(program, op.name()) diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index 7db19fe966b75..d9d0030ed194d 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -21,6 +21,7 @@ from paddle.base import core from paddle.tensorrt.converter import PaddleToTensorRTConverter from paddle.tensorrt.util import ( + mark_buitlin_op, run_pir_pass, warmup_shape_infer, ) @@ -164,7 +165,10 @@ def check_trt_result(self, rtol=1e-5, atol=1e-5): # init all parameter exe.run(startup_program) fetch_num = len(fetch_list) - fetch_index = [v.index() for v in fetch_list] + if isinstance(fetch_list[0], list): + fetch_index = [i for i, v in enumerate(fetch_list)] + else: + fetch_index = [v.index() for v in fetch_list] output_expected = self.run_program(main_program, fetch_list) min_shape_data = dict() # noqa: C408 @@ -229,6 +233,9 @@ def check_trt_result(self, rtol=1e-5, atol=1e-5): # run pir pass(including some fusion pass and trt_op_marker_pass) main_program = run_pir_pass(main_program, partition_mode=False) + # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph. + mark_buitlin_op(main_program) + # run trt_sub_graph_extract_pass() program_with_trt = run_pir_pass(main_program, partition_mode=True) diff --git a/test/tensorrt/test_converter_manipulation.py b/test/tensorrt/test_converter_manipulation.py index ccfee02e18281..bb934e45a3e40 100644 --- a/test/tensorrt/test_converter_manipulation.py +++ b/test/tensorrt/test_converter_manipulation.py @@ -211,5 +211,121 @@ def test_trt_result(self): self.check_trt_result() +class TestSplitWithNumTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.split + self.api_args = { + "x": np.random.randn(3, 9, 5).astype(np.float32), + "num_or_sections": 3, + "axis": 1, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 9, 5]} + self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSplitWithNumAxisTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.split + self.api_args = { + "x": np.random.randn(3, 9, 5).astype(np.float32), + "num_or_sections": 3, + "axis": np.array([1]).astype("int32"), + } + self.program_config = {"feed_list": ["x", "axis"]} + self.min_shape = {"x": [1, 9, 5]} + self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSplitWithNumNegativeAxisTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.split + self.api_args = { + "x": np.random.randn(3, 9, 5).astype(np.float32), + "num_or_sections": 3, + "axis": -2, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 9, 5]} + self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSplitTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.split + self.api_args = { + "x": np.random.randn(3, 9, 5).astype("float32"), + "num_or_sections": [2, 4, 3], + "axis": -2, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 9, 5]} + 
self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSplitAxisTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.split + self.api_args = { + "x": np.random.randn(3, 9, 5).astype("float32"), + "num_or_sections": [2, 4, 3], + "axis": np.array([1]).astype("int32"), + } + self.program_config = {"feed_list": ["x", "axis"]} + self.min_shape = {"x": [1, 9, 5]} + self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + +def split_api(input, num_or_sections, dim): + return _C_ops.split(input, num_or_sections, dim) + + +class TestSplitDynamicSectionsTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = split_api + self.api_args = { + "x": np.random.randn(3, 9, 5).astype("float32"), + "num_or_sections": np.array([2, 4, 3]).astype("int32"), + "axis": 1, + } + self.program_config = {"feed_list": ["x", "num_or_sections"]} + self.min_shape = {"x": [1, 9, 5]} + self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSplitDynamicSectionAndAxisTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = split_api + self.api_args = { + "x": np.random.randn(3, 9, 5).astype("float32"), + "num_or_sections": np.array([2, 4, 3]).astype("int32"), + "axis": np.array([1]).astype("int32"), + } + self.program_config = {"feed_list": ["x", "num_or_sections", "axis"]} + self.min_shape = {"x": [1, 9, 5]} + self.max_shape = {"x": [3, 9, 5]} + + def test_trt_result(self): + self.check_trt_result() + + if __name__ == '__main__': unittest.main() From 793650bc5c85ab75aa544ff6f5001d7605f5b0dd Mon Sep 17 00:00:00 2001 From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Date: Mon, 14 Oct 2024 14:04:29 +0800 Subject: [PATCH 108/135] [Auto Parallel] add align_mode supporting (#68354) * [Auto Parallel] add align_mode supporting * fix * Update dygraph_sharding_optimizer.py * Update pipeline_parallel.py * Update tensor_fusion_helper.py --- .../dygraph_optimizer/dygraph_sharding_optimizer.py | 5 +++++ .../fleet/meta_parallel/pipeline_parallel.py | 13 +++++++++++++ .../distributed/fleet/utils/tensor_fusion_helper.py | 3 +++ 3 files changed, 21 insertions(+) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 461c98c77e308..931e310b4c8dd 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -342,6 +342,11 @@ def reduce_gradients(self, parameter_list, hcg): ) g_var.scale_(1.0 / sharding_nrank) reduce_op = ReduceOp.SUM + + # In align mode, we scale the grad in advance, so we need a SUM here + if paddle.distributed.in_auto_parallel_align_mode(): + reduce_op = ReduceOp.SUM + param_rank = self._param2rank[param.name] need_check = strtobool( diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index c9fbe62edaba5..359595c285326 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -45,6 +45,7 @@ else: from .pp_utils import p2p_communication as p2p +from paddle.distributed import fleet from 
paddle.distributed.fleet.utils.tensor_fusion_helper import (
     HOOK_ACTION,
     FusedCommBuffer,
@@ -66,6 +67,15 @@ def get_action(is_dp, shard_split_param=False):
     return HOOK_ACTION.REDUCE
 
 
+def _get_align_mode_scale():
+    hcg = fleet.get_hybrid_communicate_group()
+    data_parallel_world_size = hcg.get_data_parallel_world_size()
+    sharding_parallel_world_size = hcg.get_sharding_parallel_world_size()
+    return max(data_parallel_world_size, 1) * max(
+        sharding_parallel_world_size, 1
+    )
+
+
 # assume only the first stage and last stage need data, and data consumption is ordered
 # to be replaced by real micro dataset from reader
 class FakeMicroDataset:
@@ -997,6 +1007,9 @@ def _backward_step(
         )
         if self.is_pipeline_last_stage():
             assert output_tensor_grad is None
+            # In align mode, we scale the grad directly after forward
+            if paddle.distributed.in_auto_parallel_align_mode():
+                output_tensor = output_tensor / _get_align_mode_scale()
             if self.scaler:
                 paddle.autograd.backward(self.scaler.scale(output_tensor))
             else:
diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
index 1c9bde0e7c6b1..b18221da9843b 100644
--- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
+++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
@@ -688,6 +688,9 @@ def _comm_grads(self):
             )
 
         elif self._act == HOOK_ACTION.REDUCE_SCATTER:
+            # In align mode, we scale the grad in advance, so we need a SUM here
+            if paddle.distributed.in_auto_parallel_align_mode():
+                reduce_op = paddle.distributed.ReduceOp.SUM
             shard_size = self.grad_storage._numel() // self._comm_group.nranks
             begin = shard_size * self._comm_group.rank
             end = begin + shard_size

From 5fe67bd8db674e1ee480e1fa2b531383af2148ba Mon Sep 17 00:00:00 2001
From: liuruyan <44316842+liuruyan@users.noreply.github.com>
Date: Mon, 14 Oct 2024 14:10:25 +0800
Subject: [PATCH 109/135] =?UTF-8?q?=E3=80=90CINN=E3=80=91Add=20func=20of?=
 =?UTF-8?q?=20div=20and=20mod=20for=20IterExpr.=20(#68624)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add div, mod.
* fix bug
* add header file in ut
* fix codestyle
* IndexExpr2Expr in IterMark::Mark.
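
A minimal usage sketch for reviewers (not part of the patch; the variable
`v` and its [0, 64) domain are illustrative assumptions mirroring the
`i_j_k_fused` fixture in test/cpp/pir/cinn/adt/iter_simplify_test.cc):

    // Rewrite a division of an iterator expression into IterSum/IterSplit
    // form, then normalize it back to a plain index expression.
    ir::Var v(ir::Expr(0), ir::Expr(64), "v");  // hypothetical fused loop var
    cas_intervals_t intervals = {
        {"v", CasInterval(v->lower_bound, v->upper_bound)}};
    SymbolicExprAnalyzer analyzer{intervals};
    IterMapRewriter rewriter{{v}, analyzer};
    IterMapToExprNormalizer normalizer{analyzer};

    ir::Expr e = (v * 8 + 8) / 16;
    rewriter.Rewrite(&e);    // IterSum over an IterMark of (v + 1), extent 65
    normalizer.Convert(&e);  // back to a plain index expression: (v + 1) / 2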
--- paddle/cinn/common/const_fold.h | 36 ++ paddle/cinn/common/iter_simplify.cc | 241 ++++++++++-- paddle/cinn/common/iter_simplify.h | 55 +-- paddle/cinn/common/iter_util.h | 107 +++++ paddle/cinn/ir/ir.cc | 18 +- paddle/cinn/ir/ir.h | 22 +- test/cpp/pir/cinn/adt/CMakeLists.txt | 2 +- ...{Index_expr_test.cc => index_expr_test.cc} | 3 + test/cpp/pir/cinn/adt/iter_simplify_test.cc | 364 ++++++++++++------ 9 files changed, 661 insertions(+), 187 deletions(-) create mode 100644 paddle/cinn/common/iter_util.h rename test/cpp/pir/cinn/adt/{Index_expr_test.cc => index_expr_test.cc} (92%) diff --git a/paddle/cinn/common/const_fold.h b/paddle/cinn/common/const_fold.h index 9630bbffa3963..ca388d2b7e253 100644 --- a/paddle/cinn/common/const_fold.h +++ b/paddle/cinn/common/const_fold.h @@ -71,5 +71,41 @@ inline std::optional TryConstFold(ir::Expr a, ir::Expr b) { return std::nullopt; } +template <> +inline std::optional TryConstFold(ir::Expr a, ir::Expr b) { + const ir::IntImm* pa = a.As(); + const ir::IntImm* pb = b.As(); + const auto& rtype = a.type(); + if (pa && pb) { + int64_t res = pa->value / pb->value; + return cinn::common::make_shared(rtype, res); + } + if (pa) { + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return a; + } + return std::nullopt; +} + +template <> +inline std::optional TryConstFold(ir::Expr a, ir::Expr b) { + const ir::IntImm* pa = a.As(); + const ir::IntImm* pb = b.As(); + const auto& rtype = a.type(); + if (pa && pb) { + int64_t res = pa->value % pb->value; + return cinn::common::make_shared(rtype, res); + } + if (pa) { + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return ir::Zero(rtype); + } + return std::nullopt; +} + } // namespace common } // namespace cinn diff --git a/paddle/cinn/common/iter_simplify.cc b/paddle/cinn/common/iter_simplify.cc index 3e0763334906b..44f2b7df6e8ae 100644 --- a/paddle/cinn/common/iter_simplify.cc +++ b/paddle/cinn/common/iter_simplify.cc @@ -37,22 +37,21 @@ void IterMapToExprNormalizer::Visit(const Expr* expr, Expr* op) { } } -Expr IterMapToExprNormalizer::ConvertIterSum(ir::IterSum* expr) { - Expr res(0); +ir::IndexExpr IterMapToExprNormalizer::ConvertIterSum(ir::IterSum* expr) { + ir::IndexExpr res(0); for (auto&& arg : expr->args) { auto split = arg.As(); res = res + ConvertIterSplit(split); } - res = IsZero(expr->base) ? 
res : res + expr->base; + res = res + expr->base; return res; } -Expr IterMapToExprNormalizer::ConvertIterSplit(ir::IterSplit* expr) { +ir::IndexExpr IterMapToExprNormalizer::ConvertIterSplit(ir::IterSplit* expr) { // quick branch - if (IsZero(expr->scale)) return Expr(0); - - Expr source; + if (IsZero(expr->scale)) return ir::IndexExpr(0); + ir::IndexExpr source; ir::IterMark* mark = expr->source.As(); if (auto opt = mark->source.As()) { source = opt; @@ -63,21 +62,19 @@ Expr IterMapToExprNormalizer::ConvertIterSplit(ir::IterSplit* expr) { Visit(&(mark->source), &(mark->source)); source = mark->source; } - Expr res; - if (analyzer_.ProveEQ(expr->extent, mark->extent) && + if (ProveEQ(expr->extent, mark->extent, analyzer_) && IsOne(expr->lower_factor)) { - res = source; - } else if (analyzer_.ProveEQ(mark->extent, - expr->lower_factor * expr->extent)) { + return source * expr->scale; + } else if (ProveLE( + mark->extent, expr->lower_factor * expr->extent, analyzer_)) { if (IsOne(expr->extent) && !IsOne(mark->extent)) { - res = ir::Zero(expr->extent.type()); + return ir::Zero(expr->extent.type()); } - res = source / expr->lower_factor * expr->scale; + return source / expr->lower_factor * expr->scale; } else { - res = (source % (expr->lower_factor * expr->extent)) / expr->lower_factor * - expr->scale; + return (source % (expr->lower_factor * expr->extent)) / expr->lower_factor * + expr->scale; } - return IsOne(expr->scale) ? res : res * expr->scale; } void IterMapRewriter::Visit(const ir::_Var_* op, Expr* expr) { @@ -109,7 +106,7 @@ void IterMapRewriter::Visit(const ir::Add* op, Expr* expr) { } else if (auto b_split = b.As()) { AddToLhs(ret_sum, *b_split, 1); } else { - ret_sum->base = ret_sum->base + b; + ret_sum->base = ret_sum->base + b.as_index(); } *expr = ret; } @@ -135,7 +132,7 @@ void IterMapRewriter::Visit(const ir::Sub* op, Expr* expr) { } else if (auto* b_split = b.As()) { AddToLhs(ret_sum, *b_split, -1); } else { - ret_sum->base = ret_sum->base - b; + ret_sum->base = ret_sum->base - b.as_index(); } *expr = ret; @@ -158,7 +155,8 @@ void IterMapRewriter::Visit(const ir::Mul* op, Expr* expr) { if ((a.As() || a.As()) && (b.As() || b.As())) { PADDLE_THROW(::common::errors::InvalidArgument( - "product of iter and iter is not supported")); + "Product of iter and iter is not supported")); + return; } if (!a.As() && !a.As()) { @@ -171,13 +169,204 @@ void IterMapRewriter::Visit(const ir::Mul* op, Expr* expr) { MulToLhs(a_sum, b); } else if (auto a_split = ret.As()) { - a_split->scale = a_split->scale * b; + a_split->scale = a_split->scale * b.as_index(); + } + + *expr = ret; +} + +void IterMapRewriter::Visit(const ir::Div* op, Expr* expr) { + auto a = op->a(); + auto b = op->b(); + + Visit(&a); + Visit(&b); + + if (auto const_res = cinn::common::TryConstFold(a, b)) { + *expr = const_res.value(); + return; + } + + if (!IsIterExpr(a, b)) return; + + if ((b.As() || b.As())) { + PADDLE_THROW(::common::errors::InvalidArgument( + "Division of iter and iter is not supported")); + return; + } + + auto ret = ir::ir_utils::IRCopy(a); + + auto preprocessed = PreprocessDividend(ret); + auto preprocessed_sum = preprocessed.As(); + + ret = SplitDivConst(preprocessed_sum->args[0], preprocessed_sum->base, b); + + *expr = ret; +} + +void IterMapRewriter::Visit(const ir::Mod* op, Expr* expr) { + auto a = op->a(); + auto b = op->b(); + + Visit(&a); + Visit(&b); + + if (auto const_res = cinn::common::TryConstFold(a, b)) { + *expr = const_res.value(); + return; + } + + if (!IsIterExpr(a, b)) return; + + if 
((b.As() || b.As())) { + PADDLE_THROW(::common::errors::InvalidArgument( + "Mod of iter and iter is not supported")); + return; } + auto ret = ir::ir_utils::IRCopy(a); + + auto preprocessed = PreprocessDividend(ret); + auto preprocessed_sum = preprocessed.As(); + + ret = SplitModConst(preprocessed_sum->args[0], preprocessed_sum->base, b); + *expr = ret; } -Expr IterMapRewriter::ToIterSum(const Expr& expr) { +Expr IterMapRewriter::PreprocessDividend(const Expr& dividend) { + if (dividend.As()) { + return ir::IterSum::Make({dividend}, ir::Zero(dividend.type())); + } else if (auto sum = dividend.As()) { + if (sum->args.size() == 1) { + return dividend; + } + // TODO(liuruyan): number of split in sum is greater then 1, Do `tryFuse` in + // latter. + auto fused = dividend; + return fused; + } else { + PADDLE_THROW( + ::common::errors::InvalidArgument("Expect dividend is IterExpr.")); + return Expr(); + } +} + +ir::IndexExpr IterMapRewriter::SplitDivConst(ir::IndexExpr lhs_expr, + ir::IndexExpr base, + ir::IndexExpr rhs) { + // (lhs_expr + base) // rhs + if (IsOne(rhs)) { + if (IsZero(base)) return lhs_expr; + return ir::IterSum::Make({lhs_expr}, base); + } + + auto lhs = lhs_expr.As(); + if (!IsOne(lhs->scale)) { + if (ProveDivisible(lhs->scale, rhs, analyzer_) && IsZero(base)) { + lhs->scale = lhs->scale / rhs; + return lhs; + } else if (ProveDivisible(lhs->scale, rhs, analyzer_) && + ProveDivisible(base, rhs, analyzer_)) { + lhs->scale = lhs->scale / rhs; + return ir::IterSum::Make({lhs}, base / rhs); + } else if (ProveDivisible(rhs, lhs->scale, analyzer_) && IsZero(base)) { + rhs = rhs / lhs->scale; + lhs->scale = ir::One(rhs.type()); + } else if (ProveDivisible(rhs, lhs->scale, analyzer_) && + ProveDivisible(base, lhs->scale, analyzer_)) { + base = base / lhs->scale; + rhs = rhs / lhs->scale; + lhs->scale = ir::One(rhs.type()); + } else { + PADDLE_THROW(::common::errors::InvalidArgument( + "IterExpr scale must be divisible by rhs")); + return ir::IndexExpr(); + } + } + + // TODO(liuruyan): Padding dividend to divisor later. assuming dividend canbe + // divided by divisor now. + + ir::IndexExpr new_split; + if (!ProveDivisible(base, rhs, analyzer_)) { + // padding base to divisor later. Treat the whole expr as IterMark now. 
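+    // A worked instance under assumed values (cf. test case e6 in
+    // test/cpp/pir/cinn/adt/iter_simplify_test.cc): for lhs = i with
+    // extent 64, base = 8, rhs = 16, this wraps (i + 8) in an IterMark of
+    // extent 72 and splits it by lower_factor 16 into
+    // (64 + 8 + 16 - 1) / 16 = 5 pieces with scale 1.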
+ return ir::IterSum::Make( + {ir::IterSplit::Make( + ir::IterMark::Make(ir::IterSum::Make({ir::IndexExpr(lhs)}, base), + lhs->extent + base), + rhs, + (lhs->extent + base + rhs - 1) / rhs, + ir::One(rhs.type()))}, + ir::Zero(rhs.type())); + } + + if (ProveDivisible(lhs->extent, rhs, analyzer_)) { + new_split = ir::IterSplit::Make( + lhs->source, lhs->lower_factor * rhs, lhs->extent / rhs, lhs->scale); + } else if (IsOne(lhs->lower_factor) && + ProveEQ(lhs->extent, + lhs->source.As()->extent, + analyzer_)) { + new_split = ir::IterSplit::Make( + lhs->source, rhs, (lhs->extent + rhs - 1) / rhs, lhs->scale); + } else { + new_split = ir::IterSplit::Make(ir::IterMark::Make(lhs, lhs->extent), + rhs, + (lhs->extent + rhs - 1) / rhs, + ir::One(rhs.type())); + } + + return ir::IterSum::Make({new_split}, base / rhs); +} + +ir::IndexExpr IterMapRewriter::SplitModConst(ir::IndexExpr lhs_expr, + ir::IndexExpr base, + ir::IndexExpr rhs) { + // (lhs_expr + base) % rhs + if (IsOne(rhs)) { + return ir::Zero(lhs_expr.type()); + } + + auto lhs = lhs_expr.As(); + if (!IsOne(lhs->scale)) { + if (ProveDivisible(lhs->scale, rhs, analyzer_) && IsZero(base)) { + return ir::Zero(lhs_expr.type()); + } else if (ProveDivisible(lhs->scale, rhs, analyzer_) && + ProveDivisible(base, rhs, analyzer_)) { + return ir::Zero(lhs_expr.type()); + } else if (ProveDivisible(rhs, lhs->scale, analyzer_) && IsZero(base)) { + rhs = rhs / lhs->scale; + } else if (ProveDivisible(rhs, lhs->scale, analyzer_) && + ProveDivisible(base, lhs->scale, analyzer_)) { + base = base / lhs->scale; + rhs = rhs / lhs->scale; + } else { + PADDLE_THROW(::common::errors::InvalidArgument( + "IterExpr scale must be divisible by rhs")); + return ir::IndexExpr(); + } + } + + if (!ProveDivisible(base, rhs, analyzer_)) { + auto lhs_s1 = ir::IterSplit::Make( + lhs->source, lhs->lower_factor, lhs->extent, ir::One(lhs_expr.type())); + // padding base to divisor later. Treat the whole expr as IterMark now. + return ir::IterSplit::Make( + ir::IterMark::Make(ir::IterSum::Make({lhs_s1}, base), + lhs->extent + base), + ir::One(rhs.type()), + rhs, + lhs->scale); + } + // TODO(liuruyan): Padding dividend to divisor later. assuming dividend canbe + // divided by divisor now. + + return ir::IterSplit::Make(lhs->source, lhs->lower_factor, rhs, lhs->scale); +} + +ir::IndexExpr IterMapRewriter::ToIterSum(const Expr& expr) { if (expr.As()) { return expr; } else if (auto split = expr.As()) { @@ -192,7 +381,7 @@ Expr IterMapRewriter::ToIterSum(const Expr& expr) { void IterMapRewriter::AddToLhs(ir::IterSum* lhs, const ir::IterSplit& rhs, int sign) { - auto rhs_expr = ir::ir_utils::IRCopy(Expr(const_cast(&rhs))); + auto rhs_expr = ir::ir_utils::IRCopy(Expr(&Reference(&rhs))); for (auto&& lvalue : lhs->args) { if (lvalue == rhs_expr) { auto lsplit = lvalue.As(); @@ -209,7 +398,7 @@ void IterMapRewriter::AddToLhs(ir::IterSum* lhs, lhs->args.push_back(rhs_expr); } else { rhs_expr.As()->scale = - ir::Zero(rhs.scale.type()) - rhs.scale; + ir::Zero(rhs.scale.type()).as_index() - rhs.scale; lhs->args.push_back(rhs_expr); } } @@ -228,12 +417,12 @@ void IterMapRewriter::AddToLhs(ir::IterSum* lhs, } } -void IterMapRewriter::MulToLhs(ir::IterSum* lhs, const Expr& rhs) { +void IterMapRewriter::MulToLhs(ir::IterSum* lhs, const ir::IndexExpr& rhs) { for (auto&& lvalue : lhs->args) { auto lsplit = lvalue.As(); lsplit->scale = lsplit->scale * rhs; } - lhs->base = IsZero(lhs->base) ? 
lhs->base : lhs->base * rhs; + lhs->base = lhs->base * rhs; } } // namespace common diff --git a/paddle/cinn/common/iter_simplify.h b/paddle/cinn/common/iter_simplify.h index fb9f20ced599a..cedb366cfa3b7 100644 --- a/paddle/cinn/common/iter_simplify.h +++ b/paddle/cinn/common/iter_simplify.h @@ -17,6 +17,7 @@ #include #include #include "paddle/cinn/common/integer_set.h" +#include "paddle/cinn/common/iter_util.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/ir_mutator.h" @@ -24,27 +25,9 @@ namespace cinn { namespace common { -bool IsIterExpr(const Expr& a, const Expr& b) { - return a.As() || a.As() || - b.As() || b.As(); -} - -bool IsOne(const Expr& expr) { - if (expr.is_constant() && expr.get_constant() == 1) { - return true; - } - return false; -} -bool IsZero(const Expr& expr) { - if (expr.is_constant() && expr.get_constant() == 0) { - return true; - } - return false; -} - class IterMapToExprNormalizer : public ir::IRMutator<> { public: - explicit IterMapToExprNormalizer(SymbolicExprAnalyzer analyzer) + explicit IterMapToExprNormalizer(const SymbolicExprAnalyzer& analyzer) : analyzer_(analyzer) {} void Convert(Expr* expr) { Visit(expr, expr); } @@ -52,22 +35,25 @@ class IterMapToExprNormalizer : public ir::IRMutator<> { private: void Visit(const Expr* expr, Expr* op) override; - Expr ConvertIterSum(ir::IterSum* expr); + ir::IndexExpr ConvertIterSum(ir::IterSum* expr); - Expr ConvertIterSplit(ir::IterSplit* expr); + ir::IndexExpr ConvertIterSplit(ir::IterSplit* expr); private: - common::SymbolicExprAnalyzer& analyzer_; + common::SymbolicExprAnalyzer analyzer_; }; class IterMapRewriter : public ir::IRMutator<> { public: - explicit IterMapRewriter(const std::vector& input_iters) { + explicit IterMapRewriter(const std::vector& input_iters, + const SymbolicExprAnalyzer& analyzer) + : analyzer_(analyzer) { for (const auto& iter : input_iters) { if (IsOne(iter->upper_bound)) { var_map_[iter->name] = ir::IterSum::Make({}, iter->lower_bound); } else if (IsZero(iter->lower_bound)) { - auto tmp = ir::IterMark::Make(Expr(iter.ptr()), iter->upper_bound); + auto tmp = + ir::IterMark::Make(ir::IndexExpr(iter.ptr()), iter->upper_bound); auto mark = tmp.As(); var_map_[iter->name] = ir::IterSplit::Make(tmp); input_marks_.push_back(*mark); @@ -93,17 +79,32 @@ class IterMapRewriter : public ir::IRMutator<> { void Visit(const ir::Mul* op, Expr* expr) override; + void Visit(const ir::Div* op, Expr* expr) override; + + void Visit(const ir::Mod* op, Expr* expr) override; + private: - static Expr ToIterSum(const Expr& expr); + static ir::IndexExpr ToIterSum(const Expr& expr); static void AddToLhs(ir::IterSum* lhs, const ir::IterSplit& rhs, int sign); static void AddToLhs(ir::IterSum* lhs, const ir::IterSum& rhs, int sign); - static void MulToLhs(ir::IterSum* lhs, const Expr& rhs); + static void MulToLhs(ir::IterSum* lhs, const ir::IndexExpr& rhs); + + Expr PreprocessDividend(const Expr& dividend); + + ir::IndexExpr SplitDivConst(ir::IndexExpr lhs, + ir::IndexExpr base, + ir::IndexExpr rhs); + + ir::IndexExpr SplitModConst(ir::IndexExpr lhs, + ir::IndexExpr base, + ir::IndexExpr rhs); - std::unordered_map var_map_; + std::unordered_map var_map_; std::vector input_marks_; + common::SymbolicExprAnalyzer analyzer_; }; } // namespace common diff --git a/paddle/cinn/common/iter_util.h b/paddle/cinn/common/iter_util.h new file mode 100644 index 0000000000000..2d4e8f50c0387 --- /dev/null +++ b/paddle/cinn/common/iter_util.h @@ -0,0 +1,107 @@ +// Copyright (c) 2024 CINN 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/cinn/common/integer_set.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_mutator.h" + +namespace cinn { +namespace common { + +int64_t GetLargestMutiplyPart(const Expr& expr) { + switch (expr.node_type()) { + case cinn::ir::IrNodeTy::_Var_: + return 1; + case cinn::ir::IrNodeTy::Div: { + auto binExpr = expr.As(); + auto rhs = binExpr->b(); + if (rhs.type().is_index_type()) { + int64_t lhsDiv = GetLargestMutiplyPart(binExpr->a()); + int64_t rhsDiv = GetLargestMutiplyPart(binExpr->b()); + if (lhsDiv % rhsDiv == 0) return std::abs(lhsDiv / rhsDiv); + } + return 1; + } + case cinn::ir::IrNodeTy::IntImm: { + auto int_imm = expr.As(); + return std::abs(int_imm->value); + } + case cinn::ir::IrNodeTy::Mul: { + auto binExpr = expr.As(); + return GetLargestMutiplyPart(binExpr->a()) * + GetLargestMutiplyPart(binExpr->b()); + } + case cinn::ir::IrNodeTy::Sub: + [[fallthrough]]; + case cinn::ir::IrNodeTy::Add: + [[fallthrough]]; + case cinn::ir::IrNodeTy::Mod: { + return std::gcd(GetLargestMutiplyPart(expr.ptr()->operand(0)), + GetLargestMutiplyPart(expr.ptr()->operand(1))); + } + } + PADDLE_THROW(::common::errors::Unimplemented("Unsupported type of expr: %s", + expr.type())); +} + +bool IsIterExpr(const Expr& a, const Expr& b) { + return a.As() || a.As() || + b.As() || b.As(); +} + +bool IsOne(const Expr& expr) { + if (expr.is_constant() && expr.get_constant() == 1) { + return true; + } + return false; +} +bool IsZero(const Expr& expr) { + if (expr.is_constant() && expr.get_constant() == 0) { + return true; + } + return false; +} +bool ProveDivisible(const Expr& lhs, + const Expr& rhs, + const common::SymbolicExprAnalyzer& analyzer) { + if (auto rhs_imm = rhs.As()) { + return GetLargestMutiplyPart(lhs) % rhs_imm->value == 0; + } else if (rhs.is_var()) { + return analyzer.ProveDivisible(lhs, rhs).value_or(false); + } else { + return false; + } +} + +bool ProveEQ(const Expr& lhs, + const Expr& rhs, + const common::SymbolicExprAnalyzer& analyzer) { + if (lhs == rhs) return true; + return analyzer.ProveEQ(lhs, rhs).value_or(false); +} + +bool ProveLE(const Expr& lhs, + const Expr& rhs, + const common::SymbolicExprAnalyzer& analyzer) { + if (lhs == rhs) return true; + return analyzer.ProveLE(lhs, rhs).value_or(false); +} + +} // namespace common +} // namespace cinn diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 94158699ae93a..cd70d7177cd19 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -320,7 +320,7 @@ void _Var_::Verify() const { "A valid name is required to identify the variable.")); } -Expr IterMark::Make(const Expr &source, const Expr &extent) { +Expr IterMark::Make(const Expr &source, const IndexExpr &extent) { auto *n = make_shared(); n->source = source; n->extent = extent; @@ -335,9 +335,9 @@ IterMark &IterMark::operator=(const IterMark &other) { return *this; } Expr 
IterSplit::Make(const Expr &source, - const Expr &lower_factor, - const Expr &extent, - const Expr &scale) { + const IndexExpr &lower_factor, + const IndexExpr &extent, + const IndexExpr &scale) { auto *n = make_shared(); n->set_type(source.type()); n->source = source; @@ -353,18 +353,18 @@ Expr IterSplit::Make(const Expr &source) { n->set_type(source.type()); n->source = source; n->extent = source_mark->extent; - n->lower_factor = One(source.type()); - n->scale = One(source.type()); + n->lower_factor = One(source.type()).as_index(); + n->scale = One(source.type()).as_index(); return Expr(n); } -Expr IterSplit::Make(const Expr &source, const Expr &scale) { +Expr IterSplit::Make(const Expr &source, const IndexExpr &scale) { auto *n = make_shared(); auto source_mark = source.As(); n->set_type(source.type()); n->source = source; n->extent = source_mark->extent; - n->lower_factor = One(source.type()); + n->lower_factor = One(source.type()).as_index(); n->scale = scale; return Expr(n); } @@ -377,7 +377,7 @@ IterSplit &IterSplit::operator=(const IterSplit &other) { return *this; } -Expr IterSum::Make(const std::vector &args, const Expr &base) { +Expr IterSum::Make(const std::vector &args, const IndexExpr &base) { auto *n = make_shared(); n->set_type(base.type()); n->args = std::move(args); diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 96d5225726c1e..946c51cf4c1a4 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -1050,10 +1050,10 @@ struct IterMark : public ExprNode { } IterMark& operator=(const IterMark& other); - static Expr Make(const Expr& source, const Expr& extent); + static Expr Make(const Expr& source, const IndexExpr& extent); Type type() const { return source.type(); } Expr source; - Expr extent; + IndexExpr extent; static const IrNodeTy _node_type_ = IrNodeTy::IterMark; }; @@ -1075,17 +1075,17 @@ struct IterSplit : public ExprNode { IterSplit& operator=(const IterSplit& other); static Expr Make(const Expr& source, - const Expr& lower_factor, - const Expr& extent, - const Expr& scale); - static Expr Make(const Expr& source, const Expr& scale); + const IndexExpr& lower_factor, + const IndexExpr& extent, + const IndexExpr& scale); + static Expr Make(const Expr& source, const IndexExpr& scale); static Expr Make(const Expr& source); Type type() const { return source.type(); } Expr source; - Expr lower_factor; - Expr extent; - Expr scale; + IndexExpr lower_factor; + IndexExpr extent; + IndexExpr scale; static const IrNodeTy _node_type_ = IrNodeTy::IterSplit; }; @@ -1096,10 +1096,10 @@ struct IterSplit : public ExprNode { struct IterSum : public ExprNode { public: IterSum() = default; - static Expr Make(const std::vector& args, const Expr& base); + static Expr Make(const std::vector& args, const IndexExpr& base); Type type() const { return base.type(); } std::vector args; - Expr base; + IndexExpr base; static const IrNodeTy _node_type_ = IrNodeTy::IterSum; }; diff --git a/test/cpp/pir/cinn/adt/CMakeLists.txt b/test/cpp/pir/cinn/adt/CMakeLists.txt index 49d5c294663ca..2e3b0af1461d8 100644 --- a/test/cpp/pir/cinn/adt/CMakeLists.txt +++ b/test/cpp/pir/cinn/adt/CMakeLists.txt @@ -1,7 +1,7 @@ if(WITH_TESTING AND WITH_CINN) paddle_test(map_expr_test SRCS map_expr_test.cc) set_tests_properties(map_expr_test PROPERTIES LABELS "RUN_TYPE=CINN") - paddle_test(test_index_expr SRCS iter_simplify_test.cc) + paddle_test(test_index_expr SRCS index_expr_test.cc) paddle_test(test_iter_simplify SRCS iter_simplify_test.cc) paddle_test(merge_block_utils_test SRCS 
merge_block_utils_test.cc) endif() diff --git a/test/cpp/pir/cinn/adt/Index_expr_test.cc b/test/cpp/pir/cinn/adt/index_expr_test.cc similarity index 92% rename from test/cpp/pir/cinn/adt/Index_expr_test.cc rename to test/cpp/pir/cinn/adt/index_expr_test.cc index 02f682aa1140a..19798f338157f 100644 --- a/test/cpp/pir/cinn/adt/Index_expr_test.cc +++ b/test/cpp/pir/cinn/adt/index_expr_test.cc @@ -14,6 +14,9 @@ #include #include +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/op/ir_operators.h" namespace cinn { diff --git a/test/cpp/pir/cinn/adt/iter_simplify_test.cc b/test/cpp/pir/cinn/adt/iter_simplify_test.cc index 0c94fa2473a3f..c485c602e3d0d 100644 --- a/test/cpp/pir/cinn/adt/iter_simplify_test.cc +++ b/test/cpp/pir/cinn/adt/iter_simplify_test.cc @@ -21,11 +21,18 @@ namespace cinn { namespace common { -#define ITER_MARK(var) ir::IterMark::Make(Expr(var.ptr()), var->upper_bound) +#define ITER_MARK_VAR(var) \ + ir::IterMark::Make(ir::IndexExpr(var.ptr()), var->upper_bound) +#define ITER_MARK_SUM(sum, ext) ir::IterMark::Make(sum, ext) #define ITER_SPLIT(mark, ...) ir::IterSplit::Make(mark, ##__VA_ARGS__) -#define ITER_SUM(...) ir::IterSum::Make({__VA_ARGS__}, Expr(0)) -#define ITER_SUM_WITH_BASE(base, ...) \ - ir::IterSum::Make({__VA_ARGS__}, Expr(base)) +#define ITER_SUM(...) ir::IterSum::Make({__VA_ARGS__}, ir::IndexExpr(0)) +#define ITER_SUM_WITH_BASE(base, ...) ir::IterSum::Make({__VA_ARGS__}, base) + +#define TEST_EXPR(expr, expected, expr_norm) \ + rewriter.Rewrite(&expr); \ + EXPECT_EQ(expr, Expr(expected)); \ + normalizer.Convert(&expr); \ + EXPECT_EQ(expr, expr_norm); class TestIterSimplify : public ::testing::Test { public: @@ -33,28 +40,36 @@ class TestIterSimplify : public ::testing::Test { i = ir::Var(ir::Expr(0), ir::Expr(2), "i"); j = ir::Var(ir::Expr(0), ir::Expr(4), "j"); k = ir::Var(ir::Expr(0), ir::Expr(8), "k"); - var_intervals = {{"i", CasInterval(i->lower_bound, i->upper_bound)}, - {"j", CasInterval(j->lower_bound, j->upper_bound)}, - {"k", CasInterval(k->lower_bound, k->upper_bound)}}; - } + i_j_k_fused = ir::Var(ir::Expr(0), ir::Expr(64), "i_j_k_fused"); + var_intervals = { + {"i", CasInterval(i->lower_bound, i->upper_bound)}, + {"j", CasInterval(j->lower_bound, j->upper_bound)}, + {"k", CasInterval(k->lower_bound, k->upper_bound)}, + {"i_j_k_fused", + CasInterval(i_j_k_fused->lower_bound, i_j_k_fused->upper_bound)}}; + }; ir::Var i; ir::Var j; ir::Var k; + ir::Var i_j_k_fused; cas_intervals_t var_intervals; SymbolicExprAnalyzer analyzer{var_intervals}; }; TEST_F(TestIterSimplify, IterExprMake) { // IterMark Make func. - auto mark_expr = ITER_MARK(i); - auto mark_expr_ = ITER_MARK(j); + auto mark_expr = ITER_MARK_VAR(i); + auto mark_expr_ = ITER_MARK_VAR(j); // IterSplit Make func. auto split_0_expr = ITER_SPLIT(mark_expr); - auto split_1_expr = ITER_SPLIT(mark_expr, Expr(1)); - auto split_2_expr = ITER_SPLIT(mark_expr, Expr(1), Expr(2), Expr(1)); - auto split_3_expr = ITER_SPLIT(mark_expr, Expr(2), Expr(2), Expr(1)); - auto split_4_expr = ITER_SPLIT(mark_expr_, Expr(1), Expr(2), Expr(1)); + auto split_1_expr = ITER_SPLIT(mark_expr, ir::IndexExpr(1)); + auto split_2_expr = ITER_SPLIT( + mark_expr, ir::IndexExpr(1), ir::IndexExpr(2), ir::IndexExpr(1)); + auto split_3_expr = ITER_SPLIT( + mark_expr, ir::IndexExpr(2), ir::IndexExpr(2), ir::IndexExpr(1)); + auto split_4_expr = ITER_SPLIT( + mark_expr_, ir::IndexExpr(1), ir::IndexExpr(2), ir::IndexExpr(1)); // IterSum Make func. 
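+  // Reviewer note: an IterSum models args[0] + args[1] + ... + base; the
+  // three splits above share the same IterMark source, and the checks below
+  // verify each recovered field (source, lower_factor, extent, scale).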
auto sum_expr = ITER_SUM(split_0_expr, split_1_expr, split_2_expr); @@ -64,23 +79,23 @@ TEST_F(TestIterSimplify, IterExprMake) { auto split_2 = split_2_expr.As(); auto sum = sum_expr.As(); - EXPECT_EQ(mark->source, Expr(i.ptr())); - EXPECT_EQ(mark->extent, Expr(2)); + EXPECT_EQ(mark->source, ir::IndexExpr(i.ptr())); + EXPECT_EQ(mark->extent, ir::IndexExpr(2)); EXPECT_EQ(split_0->source, mark_expr); - EXPECT_EQ(split_0->lower_factor, Expr(1)); - EXPECT_EQ(split_0->extent, Expr(2)); - EXPECT_EQ(split_0->scale, Expr(1)); + EXPECT_EQ(split_0->lower_factor, ir::IndexExpr(1)); + EXPECT_EQ(split_0->extent, ir::IndexExpr(2)); + EXPECT_EQ(split_0->scale, ir::IndexExpr(1)); EXPECT_EQ(split_1->source, mark_expr); - EXPECT_EQ(split_1->lower_factor, Expr(1)); - EXPECT_EQ(split_1->extent, Expr(2)); - EXPECT_EQ(split_1->scale, Expr(1)); + EXPECT_EQ(split_1->lower_factor, ir::IndexExpr(1)); + EXPECT_EQ(split_1->extent, ir::IndexExpr(2)); + EXPECT_EQ(split_1->scale, ir::IndexExpr(1)); EXPECT_EQ(split_2->source, mark_expr); - EXPECT_EQ(split_2->lower_factor, Expr(1)); - EXPECT_EQ(split_2->extent, Expr(2)); - EXPECT_EQ(split_2->scale, Expr(1)); + EXPECT_EQ(split_2->lower_factor, ir::IndexExpr(1)); + EXPECT_EQ(split_2->extent, ir::IndexExpr(2)); + EXPECT_EQ(split_2->scale, ir::IndexExpr(1)); EXPECT_EQ(sum->args.size(), 3); EXPECT_EQ(sum->base, Expr(0)); @@ -93,125 +108,248 @@ TEST_F(TestIterSimplify, IterExprMake) { } TEST_F(TestIterSimplify, conversion) { - IterMapRewriter rewriter{{i}}; + IterMapRewriter rewriter{{i}, analyzer}; IterMapToExprNormalizer normalizer{analyzer}; ir::Expr e1 = i; - auto gt = ITER_SUM(ITER_SPLIT(ITER_MARK(i))); - rewriter.Rewrite(&e1); - EXPECT_EQ(e1, gt); - normalizer.Convert(&e1); - EXPECT_EQ(e1, 0 + i); + auto gt = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i))); + TEST_EXPR(e1, gt, e1); } TEST_F(TestIterSimplify, add) { - IterMapRewriter rewriter{{i, j, k}}; + IterMapRewriter rewriter{{i, j, k}, analyzer}; IterMapToExprNormalizer normalizer{analyzer}; - auto gt1 = ITER_SUM(ITER_SPLIT(ITER_MARK(i)), ITER_SPLIT(ITER_MARK(j))); - auto gt2 = ITER_SUM_WITH_BASE(Expr(0) + Expr(5), - ITER_SPLIT(ITER_MARK(i)), - ITER_SPLIT(ITER_MARK(j)), - ITER_SPLIT(ITER_MARK(k))); - auto gt3 = ITER_SUM(ITER_SPLIT(ITER_MARK(i), Expr(1) + Expr(1))); - auto gt4 = ITER_SUM_WITH_BASE(Expr(12)); + auto gt1 = + ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i)), ITER_SPLIT(ITER_MARK_VAR(j))); + auto gt2 = ITER_SUM_WITH_BASE(ir::IndexExpr(5), + ITER_SPLIT(ITER_MARK_VAR(i)), + ITER_SPLIT(ITER_MARK_VAR(j)), + ITER_SPLIT(ITER_MARK_VAR(k))); + auto gt3 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(2))); + auto gt4 = ITER_SUM_WITH_BASE(ir::IndexExpr(12)); ir::Expr e1 = i + j; ir::Expr e2 = i + j + k + 5; ir::Expr e3 = i + i; ir::Expr e4 = Expr(7) + Expr(5); - rewriter.Rewrite(&e1); - EXPECT_EQ(e1, gt1); - normalizer.Convert(&e1); - EXPECT_EQ(e1, 0 + i + j); - rewriter.Rewrite(&e2); - EXPECT_EQ(e2, gt2); - normalizer.Convert(&e2); - EXPECT_EQ(e2, 0 + i + j + k + (Expr(0) + Expr(5))); - rewriter.Rewrite(&e3); - EXPECT_EQ(e3, gt3); - normalizer.Convert(&e3); - EXPECT_EQ(e3, 0 + i * (Expr(1) + Expr(1))); - rewriter.Rewrite(&e4); - EXPECT_EQ(e4, gt4); - normalizer.Convert(&e4); - EXPECT_EQ(e4, Expr(0) + Expr(12)); + TEST_EXPR(e1, gt1, i + j); + TEST_EXPR(e2, gt2, i + j + k + 5); + TEST_EXPR(e3, gt3, i * 2); + TEST_EXPR(e4, gt4, Expr(12)); } TEST_F(TestIterSimplify, sub) { - IterMapRewriter rewriter{{i, j, k}}; + IterMapRewriter rewriter{{i, j, k}, analyzer}; IterMapToExprNormalizer normalizer{analyzer}; - auto gt1 = 
ITER_SUM(ITER_SPLIT(ITER_MARK(i)), - ITER_SPLIT(ITER_MARK(j), Expr(0) - Expr(1))); - auto gt2 = ITER_SUM_WITH_BASE(Expr(0) + Expr(5), - ITER_SPLIT(ITER_MARK(i)), - ITER_SPLIT(ITER_MARK(j)), - ITER_SPLIT(ITER_MARK(k), Expr(0) - Expr(1))); - auto gt3 = ITER_SUM(ITER_SPLIT(ITER_MARK(i), Expr(1) - Expr(1))); - auto gt4 = ITER_SUM_WITH_BASE(Expr(2)); + auto gt1 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i)), + ITER_SPLIT(ITER_MARK_VAR(j), ir::IndexExpr(-1))); + auto gt2 = + ITER_SUM_WITH_BASE(ir::IndexExpr(5), + ITER_SPLIT(ITER_MARK_VAR(i)), + ITER_SPLIT(ITER_MARK_VAR(j)), + ITER_SPLIT(ITER_MARK_VAR(k), ir::IndexExpr(-1))); + auto gt3 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(0))); + auto gt4 = ITER_SUM_WITH_BASE(ir::IndexExpr(2)); ir::Expr e1 = i - j; ir::Expr e2 = i + j - k + 5; ir::Expr e3 = i - i; ir::Expr e4 = Expr(7) - Expr(5); - - rewriter.Rewrite(&e1); - EXPECT_EQ(e1, gt1); - normalizer.Convert(&e1); - EXPECT_EQ(e1, 0 + i + j * (Expr(0) - Expr(1))); - rewriter.Rewrite(&e2); - EXPECT_EQ(e2, gt2); - normalizer.Convert(&e2); - EXPECT_EQ(e2, 0 + i + j + (k * (Expr(0) - Expr(1))) + (Expr(0) + Expr(5))); - rewriter.Rewrite(&e3); - EXPECT_EQ(e3, gt3); - normalizer.Convert(&e3); - EXPECT_EQ(e3, 0 + i * (Expr(1) - Expr(1))); - rewriter.Rewrite(&e4); - EXPECT_EQ(e4, gt4); - normalizer.Convert(&e4); - EXPECT_EQ(e4, Expr(0) + Expr(2)); + TEST_EXPR(e1, gt1, i + (j * -1)); + TEST_EXPR(e2, gt2, i + j + (k * -1) + 5); + TEST_EXPR(e3, gt3, Expr(0)); + TEST_EXPR(e4, gt4, Expr(2)); } TEST_F(TestIterSimplify, mul) { - IterMapRewriter rewriter{{i, j, k}}; + IterMapRewriter rewriter{{i, j, k}, analyzer}; IterMapToExprNormalizer normalizer{analyzer}; - auto gt1 = ITER_SUM(ITER_SPLIT(ITER_MARK(i), Expr(1) * Expr(2)), - ITER_SPLIT(ITER_MARK(j))); - auto gt2 = ITER_SUM(ITER_SPLIT(ITER_MARK(i), Expr(1) * Expr(2)), - ITER_SPLIT(ITER_MARK(j), Expr(1) * Expr(2)), - ITER_SPLIT(ITER_MARK(k))); - - auto gt3 = ITER_SUM_WITH_BASE((Expr(0) + Expr(5)) * Expr(2), - ITER_SPLIT(ITER_MARK(i), Expr(1) * Expr(2)), - ITER_SPLIT(ITER_MARK(j), Expr(1) * Expr(2)), - ITER_SPLIT(ITER_MARK(k))); - auto gt4 = ITER_SUM_WITH_BASE(Expr(35)); + auto gt1 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(2)), + ITER_SPLIT(ITER_MARK_VAR(j))); + auto gt2 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(2)), + ITER_SPLIT(ITER_MARK_VAR(j), ir::IndexExpr(2)), + ITER_SPLIT(ITER_MARK_VAR(k))); + + auto gt3 = ITER_SUM_WITH_BASE(ir::IndexExpr(10), + ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(2)), + ITER_SPLIT(ITER_MARK_VAR(j), ir::IndexExpr(2)), + ITER_SPLIT(ITER_MARK_VAR(k))); + auto gt4 = ITER_SUM_WITH_BASE(ir::IndexExpr(35)); ir::Expr e1 = i * 2 + j; ir::Expr e2 = (i + j) * 2 + k; ir::Expr e3 = (i + j + 5) * 2 + k; ir::Expr e4 = Expr(7) * Expr(5); - rewriter.Rewrite(&e1); - EXPECT_EQ(e1, gt1); - normalizer.Convert(&e1); - EXPECT_EQ(e1, 0 + i * (Expr(1) * Expr(2)) + j); - rewriter.Rewrite(&e2); - EXPECT_EQ(e2, gt2); - normalizer.Convert(&e2); - EXPECT_EQ(e2, - ((0 + (i * (Expr(1) * Expr(2)))) + (j * (Expr(1) * Expr(2)))) + k); - rewriter.Rewrite(&e3); - EXPECT_EQ(e3, gt3); - normalizer.Convert(&e3); - EXPECT_EQ( - e3, - (((0 + (i * (Expr(1) * Expr(2)))) + (j * (Expr(1) * Expr(2)))) + k) + - ((Expr(0) + Expr(5)) * Expr(2))); - rewriter.Rewrite(&e4); - EXPECT_EQ(e4, gt4); - normalizer.Convert(&e4); - EXPECT_EQ(e4, Expr(0) + Expr(35)); + TEST_EXPR(e1, gt1, i * 2 + j); + TEST_EXPR(e2, gt2, i * 2 + j * 2 + k); + TEST_EXPR(e3, gt3, i * 2 + j * 2 + k + 10); + TEST_EXPR(e4, gt4, Expr(35)); +} + +TEST_F(TestIterSimplify, div) { + 
IterMapRewriter rewriter{{i, j, k, i_j_k_fused}, analyzer}; + IterMapToExprNormalizer normalizer{analyzer}; + auto gt1 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(8), + ir::IndexExpr(8), + ir::IndexExpr(1))); + auto gt2 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(32), + ir::IndexExpr(2), + ir::IndexExpr(1))); + auto gt3 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused))); + auto gt4 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), ir::IndexExpr(2))); + auto gt5 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(2), + ir::IndexExpr(32), + ir::IndexExpr(1))); + auto gt6 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM_WITH_BASE(ir::IndexExpr(8), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused))), + ir::IndexExpr(72)), + ir::IndexExpr(16), + ir::IndexExpr(5), + ir::IndexExpr(1))); + auto gt7 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM_WITH_BASE(ir::IndexExpr(1), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused))), + ir::IndexExpr(65)), + ir::IndexExpr(2), + ir::IndexExpr(33), + ir::IndexExpr(1))); + auto gt8 = ITER_SUM_WITH_BASE(ir::IndexExpr(2), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(8), + ir::IndexExpr(8), + ir::IndexExpr(1))); + auto gt9 = ITER_SUM_WITH_BASE( + ir::IndexExpr(2), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), ir::IndexExpr(2))); + auto gt10 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM_WITH_BASE(ir::IndexExpr(1), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused))), + ir::IndexExpr(65)), + ir::IndexExpr(8), + ir::IndexExpr(9), + ir::IndexExpr(1))); + auto gt11 = ITER_SUM_WITH_BASE(ir::IndexExpr(3)); + auto gt12 = ITER_SUM_WITH_BASE(ir::IndexExpr(3)); + auto gt13 = ITER_SUM_WITH_BASE(ir::IndexExpr(15)); + auto gt14 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + + ir::Expr e1 = i_j_k_fused / 8; + ir::Expr e2 = i_j_k_fused / 8 / 4; + ir::Expr e3 = i_j_k_fused / 1; + ir::Expr e4 = i_j_k_fused * 16 / 8; + ir::Expr e5 = i_j_k_fused * 8 / 16; + ir::Expr e6 = (i_j_k_fused + 8) / 16; + ir::Expr e7 = (i_j_k_fused * 8 + 8) / 16; + ir::Expr e8 = (i_j_k_fused + 16) / 8; + ir::Expr e9 = (i_j_k_fused * 16 + 16) / 8; + ir::Expr e10 = (i_j_k_fused + 1) / 8; + ir::Expr e11 = Expr(15) / Expr(5); + ir::Expr e12 = Expr(15) / Expr(4); + ir::Expr e13 = Expr(15) / Expr(1); + ir::Expr e14 = Expr(0) / Expr(4); + + TEST_EXPR(e1, gt1, i_j_k_fused / 8); + TEST_EXPR(e2, gt2, i_j_k_fused / 32); + TEST_EXPR(e3, gt3, i_j_k_fused); + + TEST_EXPR(e4, gt4, i_j_k_fused * 2); + TEST_EXPR(e5, gt5, i_j_k_fused / 2); + TEST_EXPR(e6, gt6, (i_j_k_fused + 8) / 16); + TEST_EXPR(e7, gt7, (i_j_k_fused + 1) / 2); + TEST_EXPR(e8, gt8, i_j_k_fused / 8 + 2); + TEST_EXPR(e9, gt9, i_j_k_fused * 2 + 2); + TEST_EXPR(e10, gt10, (i_j_k_fused + 1) / 8); + TEST_EXPR(e11, gt11, Expr(3)); + TEST_EXPR(e12, gt12, Expr(3)); + TEST_EXPR(e13, gt13, Expr(15)); + TEST_EXPR(e14, gt14, Expr(0)); +} + +TEST_F(TestIterSimplify, mod) { + IterMapRewriter rewriter{{i, j, k, i_j_k_fused}, analyzer}; + IterMapToExprNormalizer normalizer{analyzer}; + auto gt1 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(1), + ir::IndexExpr(8), + ir::IndexExpr(1))); + auto gt2 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(8), + ir::IndexExpr(4), + ir::IndexExpr(1))); + auto gt3 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + auto gt4 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + auto gt5 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(1), + ir::IndexExpr(2), + ir::IndexExpr(8))); + auto gt6 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM_WITH_BASE(ir::IndexExpr(8), + 
ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused))), + ir::IndexExpr(72)), + ir::IndexExpr(1), + ir::IndexExpr(16), + ir::IndexExpr(1))); + auto gt7 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM_WITH_BASE(ir::IndexExpr(1), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(1), + ir::IndexExpr(64), + ir::IndexExpr(1))), + ir::IndexExpr(65)), + ir::IndexExpr(1), + ir::IndexExpr(2), + ir::IndexExpr(8))); + auto gt8 = ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused), + ir::IndexExpr(1), + ir::IndexExpr(8), + ir::IndexExpr(1))); + auto gt9 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + auto gt10 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM_WITH_BASE(ir::IndexExpr(1), + ITER_SPLIT(ITER_MARK_VAR(i_j_k_fused))), + ir::IndexExpr(65)), + ir::IndexExpr(1), + ir::IndexExpr(8), + ir::IndexExpr(1))); + auto gt11 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + auto gt12 = ITER_SUM_WITH_BASE(ir::IndexExpr(3)); + auto gt13 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + auto gt14 = ITER_SUM_WITH_BASE(ir::IndexExpr(0)); + + ir::Expr e1 = i_j_k_fused % 8; + ir::Expr e2 = i_j_k_fused / 8 % 4; + ir::Expr e3 = i_j_k_fused % 1; + ir::Expr e4 = i_j_k_fused * 16 % 8; + ir::Expr e5 = i_j_k_fused * 8 % 16; + ir::Expr e6 = (i_j_k_fused + 8) % 16; + ir::Expr e7 = (i_j_k_fused * 8 + 8) % 16; + ir::Expr e8 = (i_j_k_fused + 16) % 8; + ir::Expr e9 = (i_j_k_fused * 16 + 16) % 8; + ir::Expr e10 = (i_j_k_fused + 1) % 8; + ir::Expr e11 = Expr(15) % Expr(5); + ir::Expr e12 = Expr(15) % Expr(4); + ir::Expr e13 = Expr(15) % Expr(1); + ir::Expr e14 = Expr(0) % Expr(4); + + TEST_EXPR(e1, gt1, i_j_k_fused % 8); + TEST_EXPR(e2, gt2, i_j_k_fused % 32 / 8); + TEST_EXPR(e3, gt3, Expr(0)); + TEST_EXPR(e4, gt4, Expr(0)); + TEST_EXPR(e5, gt5, i_j_k_fused % 2 * 8); + TEST_EXPR(e6, gt6, (i_j_k_fused + 8) % 16); + TEST_EXPR(e7, gt7, (i_j_k_fused + 1) % 2 * 8); + TEST_EXPR(e8, gt8, i_j_k_fused % 8); + TEST_EXPR(e9, gt9, Expr(0)); + TEST_EXPR(e10, gt10, (i_j_k_fused + 1) % 8); + TEST_EXPR(e11, gt11, Expr(0)); + TEST_EXPR(e12, gt12, Expr(3)); + TEST_EXPR(e13, gt13, Expr(0)); + TEST_EXPR(e14, gt14, Expr(0)); } } // namespace common From b89eb26ae41e4b06bb1a5716eb381d1841a927f8 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Mon, 14 Oct 2024 14:10:45 +0800 Subject: [PATCH 110/135] =?UTF-8?q?=E3=80=90prim=20=E3=80=91combine=20dyna?= =?UTF-8?q?mic=20shape=20branch=20with=20part=20of=20static=20branch=20(#6?= =?UTF-8?q?8629)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify old ir put_along_axis api * upate * modify paddletest * modify fetch input name * combine reduce_as branch of dynamic and staic shape * combine reduce_as branch of dynamic and staic shape * modify divice_grad as phi kernel * delete program check --------- Co-authored-by: phlrain --- paddle/fluid/primitive/rule/vjp/details.h | 237 +++++----------------- test/prim/pir_prim/test_vjp_prim.py | 25 +-- 2 files changed, 57 insertions(+), 205 deletions(-) diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index f39536f599b23..f3cc34a9d1f0d 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -135,40 +135,24 @@ void divide_grad(const Tensor& x, Tensor* dy) { if (dy) { // dy = -(x/y^2) * dout - auto dy_res = -(x / (y * y)) * out_grad; - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + auto dy_res = -out_grad * (x / y / y); + if (has_dynamic_shape(y.shape()) || 
has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != y.dims()) { auto dy_tmp = reduce_as(dy_res, y); set_output(dy_tmp, dy); } else { - if (out_grad.dims() != y.dims()) { - phi::DDim reduce_dim = - get_reduce_dims_from_out(out_grad.dims(), y.dims()); - auto dy_reduce_res = - sum(dy_res, common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } else { - set_output(dy_res, dy); - } + set_output(dy_res, dy); } } // indicate we will compute dy if (dx) { // dx = (1/y) * dout - Tensor one_tensor = full_scalar(1.0, y.dtype()); - auto dx_res = one_tensor / y * out_grad; - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + auto dx_res = out_grad / y; + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != x.dims()) { auto dx_tmp = reduce_as(dx_res, x); set_output(dx_tmp, dx); } else { - if (out_grad.dims() != x.dims()) { - auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto dx_reduce_res = - sum(dx_res, common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } else { - set_output(dx_res, dx); - } + set_output(dx_res, dx); } } // indicate we will compute dx } @@ -601,37 +585,22 @@ void add_grad(const Tensor& x, Tensor* dx, Tensor* dy) { if (dy) { - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != y.dims()) { auto dy_tmp = reduce_as(out_grad, y); set_output(dy_tmp, dy); } else { - if (out_grad.dims() != y.dims()) { - phi::DDim reduce_dim = - get_reduce_dims_from_out(out_grad.dims(), y.dims()); - auto dy_reduce_res = - out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } else { - by_pass(out_grad, dy); - } + by_pass(out_grad, dy); } } if (dx) { - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != x.dims()) { auto dx_tmp = reduce_as(out_grad, x); set_output(dx_tmp, dx); } else { - if (out_grad.dims() != x.dims()) { - auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto dx_reduce_res = - out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } else { - by_pass(out_grad, dx); - } + by_pass(out_grad, dx); } } } @@ -645,36 +614,21 @@ void subtract_grad(const Tensor& x, Tensor* dy) { if (dy) { auto scale_out_grad = scale(out_grad, -1.0, 0.0, true); - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != y.dims()) { auto dy_tmp = reduce_as(scale_out_grad, y); set_output(dy_tmp, dy); } else { - if (out_grad.dims() != y.dims()) { - phi::DDim reduce_dim = - get_reduce_dims_from_out(out_grad.dims(), y.dims()); - auto dy_reduce_res = - scale_out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } else { - by_pass(scale_out_grad, dy); - } + set_output(scale_out_grad, dy); } } if (dx) { - if (has_dynamic_shape(x.shape()) || 
has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != x.dims()) { auto dx_tmp = reduce_as(out_grad, x); set_output(dx_tmp, dx); } else { - if (out_grad.dims() != x.dims()) { - auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto dx_reduce_res = - out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); - } else { - by_pass(out_grad, dx); - } + by_pass(out_grad, dx); } } } @@ -689,41 +643,23 @@ void multiply_grad(const Tensor& x, if (x_grad) { auto x_grad_unreduce = out_grad * y; if (has_dynamic_shape(x.shape()) || - has_dynamic_shape(x_grad_unreduce.shape())) { + has_dynamic_shape(x_grad_unreduce.shape()) || + x_grad_unreduce.dims() != x.dims()) { auto x_grad_reduced = reduce_as(x_grad_unreduce, x); set_output(x_grad_reduced, x_grad); } else { - if (x_grad_unreduce.dims() != x.dims()) { - auto axes = get_reduce_dims_from_out(x_grad_unreduce.dims(), x.dims()); - auto x_grad_reduced = x_grad_unreduce.sum( - common::vectorize(axes), x_grad_unreduce.dtype(), false); - if (x_grad_reduced.dims().size() != x.dims().size()) { - x_grad_reduced = reshape(x_grad_reduced, x.shape()); - } - set_output(x_grad_reduced, x_grad); - } else { - set_output(x_grad_unreduce, x_grad); - } + set_output(x_grad_unreduce, x_grad); } } if (y_grad) { auto y_grad_unreduce = out_grad * x; if (has_dynamic_shape(y.shape()) || - has_dynamic_shape(y_grad_unreduce.shape())) { + has_dynamic_shape(y_grad_unreduce.shape()) || + y_grad_unreduce.dims() != y.dims()) { auto y_grad_reduced = reduce_as(y_grad_unreduce, y); set_output(y_grad_reduced, y_grad); } else { - if (y_grad_unreduce.dims() != y.dims()) { - auto axes = get_reduce_dims_from_out(y_grad_unreduce.dims(), y.dims()); - auto y_grad_reduced = y_grad_unreduce.sum( - common::vectorize(axes), y_grad_unreduce.dtype(), false); - if (y_grad_reduced.dims().size() != y.dims().size()) { - y_grad_reduced = reshape(y_grad_reduced, y.shape()); - } - set_output(y_grad_reduced, y_grad); - } else { - set_output(y_grad_unreduce, y_grad); - } + set_output(y_grad_unreduce, y_grad); } } } @@ -739,20 +675,12 @@ void elementwise_pow_grad(const Tensor& x, auto lnx = log(x); auto x_pow_y = elementwise_pow(x, y); auto dy_res = lnx * x_pow_y * out_grad; - if (has_dynamic_shape(out_grad.shape()) || has_dynamic_shape(y.shape())) { + if (has_dynamic_shape(out_grad.shape()) || has_dynamic_shape(y.shape()) || + out_grad.dims() != y.dims()) { auto dy_reduce_res = reduce_as(dy_res, y); set_output(dy_reduce_res, dy); } else { - if (out_grad.dims() != y.dims()) { - phi::DDim reduce_dim = - get_reduce_dims_from_out(out_grad.dims(), y.dims()); - auto dy_reduce_res = - dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, dy); - } else { - set_output(dy_res, dy); - } + set_output(dy_res, dy); } } // indicate we will compute dy if (dx) { @@ -768,11 +696,8 @@ void elementwise_pow_grad(const Tensor& x, auto x_pow_z = elementwise_pow(x, tmp_z); auto dx_res = y * x_pow_z * out_grad; if (out_grad.dims() != x.dims()) { - auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto dx_reduce_res = - dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, dx); + auto dx_reduce_res = reduce_as(dx_res, x); 
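+          // reduce_as sums dx_res over the axes x was broadcast along and
+          // restores x's rank, replacing the manual sum + reshape pair
+          // removed above.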
+ set_output(dx_reduce_res, dx); } else { set_output(dx_res, dx); } @@ -1019,20 +944,12 @@ void expand_grad(const Tensor& x, const IntArray& shape, Tensor* x_grad) { if (x_grad) { - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != x.dims()) { auto reduced = reduce_as(out_grad, x); set_output(reduced, x_grad); } else { - if (out_grad.dims() != x.dims()) { - auto axes = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto reduced = out_grad.sum(common::vectorize(axes), x.dtype(), false); - if (reduced.dims().size() != x.dims().size()) { - reduced = reshape(reduced, x.shape()); - } - set_output(reduced, x_grad); - } else { - by_pass(out_grad, x_grad); - } + by_pass(out_grad, x_grad); } } } @@ -1220,21 +1137,13 @@ void matmul_grad(const Tensor& x, x_grad_trans = transpose(x_grad_mm, reverse_perm); } if (has_dynamic_shape(x.shape()) || - has_dynamic_shape(x_grad_trans.shape())) { + has_dynamic_shape(x_grad_trans.shape()) || + x_grad_trans.dims() != x.dims()) { auto x_grad_out = reduce_as(x_grad_trans, temp_x_unsqueeze); set_output(x_grad_out, x_grad); } else { - if (x_grad_trans.dims() != x.dims()) { - phi::DDim x_reduce_dim = get_reduce_dims_from_out( - x_grad_trans.dims(), temp_x_unsqueeze.dims()); - auto dx_reduce_res = sum( - x_grad_trans, common::vectorize(x_reduce_dim), x.dtype(), false); - auto x_grad_out = reshape(dx_reduce_res, x.shape()); - set_output(x_grad_out, x_grad); - } else { - auto x_grad_out = x_grad_trans; - set_output(x_grad_out, x_grad); - } + auto x_grad_out = x_grad_trans; + set_output(x_grad_out, x_grad); } } @@ -1253,21 +1162,13 @@ void matmul_grad(const Tensor& x, y_grad_trans = transpose(y_grad_mm, reverse_perm); } if (has_dynamic_shape(y.shape()) || - has_dynamic_shape(y_grad_trans.shape())) { + has_dynamic_shape(y_grad_trans.shape()) || + y_grad_trans.dims() != y.dims()) { auto y_grad_out = reduce_as(y_grad_trans, temp_y_unsqueeze); set_output(y_grad_out, y_grad); } else { - if (y_grad_trans.dims() != y.dims()) { - phi::DDim y_reduce_dim = get_reduce_dims_from_out( - y_grad_trans.dims(), temp_y_unsqueeze.dims()); - auto dy_reduce_res = sum( - y_grad_trans, common::vectorize(y_reduce_dim), y.dtype(), false); - auto y_grad_out = reshape(dy_reduce_res, y.shape()); - set_output(y_grad_out, y_grad); - } else { - auto y_grad_out = y_grad_trans; - set_output(y_grad_out, y_grad); - } + auto y_grad_out = y_grad_trans; + set_output(y_grad_out, y_grad); } } } @@ -1281,39 +1182,24 @@ void maximum_grad(const Tensor& x, if (x_grad) { auto x_tmp = cast(greater_than(x, y), out_grad.dtype()); auto dx_res = out_grad * x_tmp; - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != x.dims()) { auto dx_reduce_res = reduce_as(dx_res, x); set_output(dx_reduce_res, x_grad); } else { - if (out_grad.dims() != x.dims()) { - auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto dx_reduce_res = - dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, x_grad); - } else { - set_output(dx_res, x_grad); - } + set_output(dx_res, x_grad); } } if (y_grad) { auto y_tmp = cast(less_equal(x, y), out_grad.dtype()); auto dy_res = out_grad * y_tmp; - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + if 
(has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != y.dims()) { auto dy_reduce_res = reduce_as(dy_res, y); set_output(dy_reduce_res, y_grad); } else { - if (out_grad.dims() != y.dims()) { - phi::DDim reduce_dim = - get_reduce_dims_from_out(out_grad.dims(), y.dims()); - auto dy_reduce_res = - dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, y_grad); - } else { - set_output(dy_res, y_grad); - } + set_output(dy_res, y_grad); } } } @@ -2241,39 +2127,24 @@ void minimum_grad(const Tensor& x, if (x_grad) { auto x_tmp = cast(less_than(x, y), out_grad.dtype()); auto dx_res = out_grad * x_tmp; - if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(x.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != x.dims()) { auto dx_reduce_res = reduce_as(dx_res, x); set_output(dx_reduce_res, x_grad); } else { - if (out_grad.dims() != x.dims()) { - auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims()); - auto dx_reduce_res = - dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); - set_output(dx_tmp, x_grad); - } else { - set_output(dx_res, x_grad); - } + set_output(dx_res, x_grad); } } if (y_grad) { auto y_tmp = cast(greater_equal(x, y), out_grad.dtype()); auto dy_res = out_grad * y_tmp; - if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape())) { + if (has_dynamic_shape(y.shape()) || has_dynamic_shape(out_grad.shape()) || + out_grad.dims() != y.dims()) { auto dy_reduce_res = reduce_as(dy_res, y); set_output(dy_reduce_res, y_grad); } else { - if (out_grad.dims() != y.dims()) { - phi::DDim reduce_dim = - get_reduce_dims_from_out(out_grad.dims(), y.dims()); - auto dy_reduce_res = - dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); - set_output(dy_tmp, y_grad); - } else { - set_output(dy_res, y_grad); - } + set_output(dy_res, y_grad); } } } diff --git a/test/prim/pir_prim/test_vjp_prim.py b/test/prim/pir_prim/test_vjp_prim.py index b12adf45bb5a9..288ff11c8e26e 100644 --- a/test/prim/pir_prim/test_vjp_prim.py +++ b/test/prim/pir_prim/test_vjp_prim.py @@ -84,32 +84,13 @@ def test_divide_grad_prim_case1(self): out_grads, stop_gradients, ) + print(pir_program) reshape_op2 = pir_program.global_block().ops[-1] - reshape_op1 = pir_program.global_block().ops[-4] + reshape_op1 = pir_program.global_block().ops[-2] self.assertEqual(len(grad_outs), 2) - self.assertEqual(len(pir_program.global_block().ops), 16) + self.assertEqual(len(pir_program.global_block().ops), 11) self.assertTrue(reshape_op2.result(0).is_same(grad_outs[0][0])) self.assertTrue(reshape_op1.result(0).is_same(grad_outs[1][0])) - all_op_names = [ - "pd_op.full", - "pd_op.full", - "pd_op.full", - "pd_op.divide", - "pd_op.multiply", - "pd_op.divide", - "pd_op.full", - "pd_op.scale", - "pd_op.multiply", - "pd_op.full_int_array", - "pd_op.sum", - "pd_op.full_int_array", - "pd_op.reshape", - "pd_op.full", - "pd_op.divide", - "pd_op.multiply", - ] - for idx, op in enumerate(pir_program.global_block().ops): - self.assertEqual(op.name(), all_op_names[idx]) paddle.framework.core._set_prim_backward_enabled(False) def test_divide_grad_no_prim(self): From 2af6ab7c40263080a84240578034e8dae2dd3528 Mon Sep 17 00:00:00 2001 From: Wennie396 <44974020+Wennie396@users.noreply.github.com> 
Date: Mon, 14 Oct 2024 14:18:13 +0800 Subject: [PATCH 111/135] fix engine._parameter_name_list after fusion pass (#68650) * fix engine._parameter_name_list after fusion pass * fix bug --- .../distributed/auto_parallel/static/engine.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index e6b6ae20a1d02..1af56be629aef 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -718,6 +718,19 @@ def _parallel_pir(self, mode): mode="all", ) + # update self._parameter_name_list after fused_ffn_qkv, otherwise opt stage will not update fused params + for k in self.fused_ffn_qkv.keys(): + for fusion in self.fused_ffn_qkv[k]: + for after_fuse_name, before_fuse_params in fusion.items(): + index = self._parameter_name_list.index( + before_fuse_params[0].name + ) + self._parameter_name_list.insert(index, after_fuse_name) + for before_fuse_param in before_fuse_params: + self._parameter_name_list.remove( + before_fuse_param.name + ) + forward_op_start_idx = 0 backward_op_start_idx = -1 opt_op_start_idx = -1 From 923cc24850f41642f21304f880d55a7d148cee13 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Mon, 14 Oct 2024 14:31:42 +0800 Subject: [PATCH 112/135] disable inthenelse in rearange pass (#68668) --- paddle/cinn/optim/rearrange_load_instruction.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/cinn/optim/rearrange_load_instruction.cc b/paddle/cinn/optim/rearrange_load_instruction.cc index a4ed938f5abde..4f2b83752fb1c 100644 --- a/paddle/cinn/optim/rearrange_load_instruction.cc +++ b/paddle/cinn/optim/rearrange_load_instruction.cc @@ -117,6 +117,7 @@ struct RearrangeLoadInstructionMutator : public ir::IRMutator { void Visit(const ir::Select *op, Expr *expr) override {} void Visit(const ir::Broadcast *op, Expr *expr) override {} + void Visit(const ir::IfThenElse *op, Expr *expr) override {} void replaceBlock(ir::Block *op, int old_let_size, int old_stmts_size) { std::vector new_stmts; From b8df23b6b764816a97cd42652059fb06a58e1d8b Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 14 Oct 2024 16:02:21 +0800 Subject: [PATCH 113/135] Add 12.4 (#66560) * Add cuda12.4 * test=document_fix * test=document_fix * test=document_fix * Fix * test=document_fix * test=document_fix * fix openssl * fix;test=document_fix * test=document_fix * test=document_fix * test=document_fix * Fix 12.4 * Fix 12.4 * test=document_fix * test=document_fix * test=document_fix * test=document_fix * test=document_fix * test=document_fix * test=document_fix * Add 12.6 docker * Update pip * Fix pip dir * Fix pip dir * Fix --- tools/dockerfile/Dockerfile.centos | 23 ++++++++----------- tools/dockerfile/Dockerfile.ubuntu20 | 22 ++++++++---------- tools/dockerfile/build_scripts/build.sh | 7 ++---- tools/dockerfile/build_scripts/build_utils.sh | 2 +- .../dockerfile/build_scripts/install_cudnn.sh | 10 ++++++++ tools/dockerfile/centos7_manylinux.sh | 22 ++++++++++++++++++ tools/dockerfile/ubuntu20_dev.sh | 22 ++++++++++++++++++ 7 files changed, 76 insertions(+), 32 deletions(-) diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index be2a97b036191..50bea029c3471 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH 
/usr/local/ssl/lib:/opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y bzip2 gettext-devel sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel patch +RUN yum update -y && yum install -y bzip2 gettext-devel sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel libtool xz graphviz wget curl-devel patch perl swig COPY build_scripts /build_scripts RUN bash build_scripts/build.sh #RUN bash build_scripts/install_nccl2.sh @@ -25,7 +25,7 @@ RUN ln -s /usr/local/ssl/include/openssl /usr/include RUN wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \ tar -xvf git-2.17.1.tar.gz && \ cd git-2.17.1 && \ - ./configure --with-openssl --prefix=/usr/local && \ + ./configure --with-openssl CFLAGS="-Dsocklen_t=uint32_t" --prefix=/usr/local && \ make -j8 && make install ENV SSL_CERT_FILE=/opt/_internal/certs.pem @@ -42,11 +42,6 @@ RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.15.12.l mkdir /root/gopath/src -# protobuf 3.6.1 -RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \ - tar xzf protobuf-cpp-3.6.1.tar.gz && \ - cd protobuf-3.6.1 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.6.1.tar.gz - RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt @@ -70,14 +65,14 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_i LD_LIBRARY_PATH=/opt/_internal/cpython-3.11.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.11.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.12.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.12.0/bin/pip3 install pre-commit 'ipython==5.3.0' -RUN wget -O /opt/swig-2.0.12.tar.gz --no-check-certificate https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ - cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz -# ccache 3.7.9 -RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ - tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ - ./configure -prefix=/usr/local/ccache-3.7.9 && \ +# ccache 4.8.2 +RUN wget -q https://paddle-ci.gz.bcebos.com/ccache-4.8.2.tar.gz && \ + tar xf ccache-4.8.2.tar.gz && mkdir /usr/local/ccache-4.8.2 && cd ccache-4.8.2 && \ + mkdir build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.8.2 .. 
&& \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + ln -s /usr/local/ccache-4.8.2/bin/ccache /usr/local/bin/ccache && \ + cd ../../ && rm -rf ccache-4.8.2.tar.gz CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] diff --git a/tools/dockerfile/Dockerfile.ubuntu20 b/tools/dockerfile/Dockerfile.ubuntu20 index e32bc50923bda..63f68d03101e4 100644 --- a/tools/dockerfile/Dockerfile.ubuntu20 +++ b/tools/dockerfile/Dockerfile.ubuntu20 @@ -69,30 +69,29 @@ RUN apt-get update && \ rm /usr/bin/python3 && ln -s /usr/bin/python3.9 /usr/bin/python3 WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/a7/e0/30642b9c2df516506d40b563b0cbd080c49c6b3f11a70b4c7a670f13a78b/setuptools-50.3.2.zip && unzip setuptools-50.3.2.zip +RUN wget -q https://files.pythonhosted.org/packages/a7/e0/30642b9c2df516506d40b563b0cbd080c49c6b3f11a70b4c7a670f13a78b/setuptools-50.3.2.zip && unzip setuptools-50.3.2.zip WORKDIR /home/setuptools-50.3.2 RUN python3.9 setup.py build && python3.9 setup.py install && \ python3.8 setup.py build && python3.8 setup.py install WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/ef/cc/93f7213b2ab5ed383f98ce8020e632ef256b406b8569606c3f160ed8e1c9/setuptools-68.2.2.tar.gz && tar xf setuptools-68.2.2.tar.gz +RUN wget -q https://files.pythonhosted.org/packages/ef/cc/93f7213b2ab5ed383f98ce8020e632ef256b406b8569606c3f160ed8e1c9/setuptools-68.2.2.tar.gz && tar xf setuptools-68.2.2.tar.gz WORKDIR /home/setuptools-68.2.2 RUN python3.10 setup.py build && python3.10 setup.py install && \ python3.11 setup.py build && python3.11 setup.py install && \ python3.12 setup.py build && python3.12 setup.py install WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/1f/7f/4da15e07ccd11c84c1ccc8f6e24288d5e76c99441bf80e315b33542db951/pip-23.3.1.tar.gz && tar -zxf pip-23.3.1.tar.gz -WORKDIR pip-23.3.1 -RUN python3.9 setup.py install && \ - python3.8 setup.py install && \ - python3.10 setup.py install && \ - python3.11 setup.py install && \ - python3.12 setup.py install +RUN wget -q https://bootstrap.pypa.io/get-pip.py +RUN python3.9 get-pip.py && \ + python3.8 get-pip.py && \ + python3.10 get-pip.py && \ + python3.11 get-pip.py && \ + python3.12 get-pip.py WORKDIR /home -RUN rm setuptools-50.3.2.zip setuptools-68.2.2.tar.gz pip-23.3.1.tar.gz && \ - rm -r setuptools-50.3.2 setuptools-68.2.2 pip-23.3.1 +RUN rm setuptools-50.3.2.zip setuptools-68.2.2.tar.gz && \ + rm -r setuptools-50.3.2 setuptools-68.2.2 get-pip.py # remove them when apt-get support 2.27 and higher version RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ @@ -151,7 +150,6 @@ RUN pip3.8 --no-cache-dir install pre-commit==2.17.0 && \ COPY ./python/requirements.txt /root/ COPY ./python/unittest_py/requirements.txt /home/ - RUN pip3.8 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /home/requirements.txt && \ diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 402111b38e163..112cf6e3c4cd9 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -36,7 +36,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 # Dependencies for compiling Python that we want to remove from # the final image after compiling Python -PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel 
libpcap-devel xz-devel libffi-devel" +PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel xz-devel libffi-devel" # Libraries that are allowed as part of the manylinux1 profile MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" @@ -49,11 +49,8 @@ source $MY_DIR/build_utils.sh yum -y install wget curl epel-release # Development tools and libraries -yum -y install bzip2 make git patch unzip bison yasm diffutils \ +yum -y install bzip2 make git patch unzip bison diffutils \ automake which file \ - kernel-devel-`uname -r` \ - devtoolset-2-binutils devtoolset-2-gcc \ - devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \ ${PYTHON_COMPILE_DEPS} # Install more recent version of cmake diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh index 2d5d35754551c..54e2d552b7228 100755 --- a/tools/dockerfile/build_scripts/build_utils.sh +++ b/tools/dockerfile/build_scripts/build_utils.sh @@ -144,7 +144,7 @@ function do_openssl_build { ./config -fPIC --prefix=/usr/local/ssl > /dev/null make > /dev/null make install > /dev/null - + ln -sf /usr/lib64/libcrypto.so.1.1 /usr/local/ssl/lib/libcrypto.so.1.1 } diff --git a/tools/dockerfile/build_scripts/install_cudnn.sh b/tools/dockerfile/build_scripts/install_cudnn.sh index 402122dc205de..10ea4a8f68d5e 100644 --- a/tools/dockerfile/build_scripts/install_cudnn.sh +++ b/tools/dockerfile/build_scripts/install_cudnn.sh @@ -96,4 +96,14 @@ elif [[ "$1" == "cudnn900" ]]; then cp -r lib /usr && cd ../ && \ rm -f cudnn-linux-x86_64-9.0.0.312_cuda12-archive.tar.xz && \ rm -rf cudnn-linux-x86_64-9.0.0.312_cuda12-archive +elif [[ "$1" == "cudnn911" ]]; then + wget -q https://paddle-ci.gz.bcebos.com/cudnn/cudnn-linux-x86_64-9.1.1.17_cuda12-archive.tar.xz --no-check-certificate + tar xJvf cudnn-linux-x86_64-9.1.1.17_cuda12-archive.tar.xz && \ + cd cudnn-linux-x86_64-9.1.1.17_cuda12-archive && \ + cp -r include /usr && \ + mkdir -p /usr/lib/x86_64-linux-gnu && \ + cp -r lib/libcudnn* /usr/lib/x86_64-linux-gnu && \ + cp -r lib /usr && cd ../ && \ + rm -f cudnn-linux-x86_64-9.1.1.17_cuda12-archive.tar.xz && \ + rm -rf cudnn-linux-x86_64-9.1.1.17_cuda12-archive fi diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 38c443bde9cb8..f237200a63b51 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -60,6 +60,22 @@ function make_cuda123cudnn900trt8616() { sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp } +function make_cuda124cudnn911trt8616() { + sed 's//12.4.1-cudnn-devel-rockylinux8/g' Dockerfile.rockylinux8 >Dockerfile.tmp + sed -i "s##RUN dnf install -y gcc-toolset-12-gcc* \&\& source /opt/rh/gcc-toolset-12/enable \&\& echo 'source /opt/rh/gcc-toolset-12/enable' >>~/.bashrc \nENV PATH=/opt/rh/gcc-toolset-12/root/usr/bin:/usr/share/Modules/bin:\$PATH \nENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:\$LD_LIBRARY_PATH #g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_cudnn.sh cudnn911 \nENV CUDNN_VERSION=9.1.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#build_scripts/install_trt.sh#build_scripts/install_trt.sh trt8616#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + +function make_cuda125cudnn911trt8616() { + sed 
's//12.5.1-cudnn-devel-rockylinux8/g' Dockerfile.rockylinux8 >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc122 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-12.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-12.2/bin:\$PATH \nRUN bash build_scripts/install_cudnn.sh cudnn911 \nENV CUDNN_VERSION=9.1.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#build_scripts/install_trt.sh#build_scripts/install_trt.sh trt8616#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + + function main() { local CMD=$1 case $CMD in @@ -81,6 +97,12 @@ function main() { cuda123cudnn900trt8616) make_cuda123cudnn900trt8616 ;; + cuda124cudnn911trt8616) + make_cuda124cudnn911trt8616 + ;; + cuda125cudnn911trt8616) + make_cuda125cudnn911trt8616 + ;; *) echo "Make dockerfile error, Without this paramet." exit 1 diff --git a/tools/dockerfile/ubuntu20_dev.sh b/tools/dockerfile/ubuntu20_dev.sh index ec9d9d9f97e3f..f829a883cbdc8 100755 --- a/tools/dockerfile/ubuntu20_dev.sh +++ b/tools/dockerfile/ubuntu20_dev.sh @@ -86,6 +86,24 @@ function base_image(){ sed -i 's#RUN bash /build_scripts/install_trt.sh#RUN bash /build_scripts/install_trt.sh trt8616#g' ${dockerfile_name} sed -i 's#cudnn841#cudnn900#g' ${dockerfile_name} sed -i 's#CUDNN_VERSION=8.4.1#CUDNN_VERSION=9.0.0#g' ${dockerfile_name} + elif [[ ${ref_CUDA_MAJOR} == "12.4" ]];then + dockerfile_name="Dockerfile-124" + sed "s##nvidia/cuda:12.4.1-devel-ubuntu20.04#g" ./Dockerfile.ubuntu20 >${dockerfile_name} + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-12.4/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name} + sed -i 's###g' ${dockerfile_name} + sed -i "s##WORKDIR /usr/bin ENV PATH=/usr/local/gcc-12.0/bin:\$PATH #g" ${dockerfile_name} + sed -i 's#RUN bash /build_scripts/install_trt.sh#RUN bash /build_scripts/install_trt.sh trt8616#g' ${dockerfile_name} + sed -i 's#cudnn841#cudnn911#g' ${dockerfile_name} + sed -i 's#CUDNN_VERSION=8.4.1#CUDNN_VERSION=9.1.1#g' ${dockerfile_name} + elif [[ ${ref_CUDA_MAJOR} == "12.6" ]];then + dockerfile_name="Dockerfile-126" + sed "s##nvidia/cuda:12.6.0-devel-ubuntu20.04#g" ./Dockerfile.ubuntu20 >${dockerfile_name} + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-12.6/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name} + sed -i 's###g' ${dockerfile_name} + sed -i "s##WORKDIR /usr/bin ENV PATH=/usr/local/gcc-12.0/bin:\$PATH #g" ${dockerfile_name} + sed -i 's#RUN bash /build_scripts/install_trt.sh#RUN bash /build_scripts/install_trt.sh trt8616#g' ${dockerfile_name} + sed -i 's#cudnn841#cudnn911#g' ${dockerfile_name} + sed -i 's#CUDNN_VERSION=8.4.1#CUDNN_VERSION=9.1.1#g' ${dockerfile_name} else echo "Dockerfile ERROR!!!" 
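    # unknown ref_CUDA_MAJOR: fail fast instead of emitting a partially generated Dockerfile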
exit 1 @@ -108,3 +126,7 @@ export ref_CUDA_MAJOR=12.0 base_image export ref_CUDA_MAJOR=12.3 base_image +export ref_CUDA_MAJOR=12.4 +base_image +export ref_CUDA_MAJOR=12.6 +base_image From a051a5bfd9b2003e8fc203b0354819c01993a337 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 14 Oct 2024 16:12:53 +0800 Subject: [PATCH 114/135] [PIR] Add AMP logic for manual API `fused_gemm_epilogue` (#68655) --- .../pir/dialect/operator/ir/manual_api.cc | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc index 537200e3ea1f7..e077f65562803 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/pir/dialect/operator/ir/manual_api.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/amp_utils.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_tools.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" @@ -279,6 +281,33 @@ std::tuple fused_gemm_epilogue(pir::Value x, bool trans_x, bool trans_y, std::string activation) { + // AMP Logic + if (egr::Controller::Instance().GetCurrentAmpAttrs()->GetAmpLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP: fused_gemm_epilogue"; + auto op_name = phi::TransToFluidOpName("fused_gemm_epilogue"); + paddle::small_vector, egr::kSlotSmallVectorSize> + amp_values_vector = {{x}, {y}, {bias}}; + auto amp_dst_dtype = + paddle::imperative::GetAmpDestDtype(op_name, amp_values_vector); + auto new_x = + paddle::imperative::AmpAutoCast("x", x, amp_dst_dtype, op_name); + auto new_y = + paddle::imperative::AmpAutoCast("y", y, amp_dst_dtype, op_name); + auto new_bias = + paddle::imperative::AmpAutoCast("bias", bias, amp_dst_dtype, op_name); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentAmpAttrs(), + paddle::imperative::AmpLevel::O0); + return paddle::dialect::fused_gemm_epilogue( + new_x, new_y, new_bias, trans_x, trans_y, activation); + } + } + + // Type Promotion Logic + VLOG(5) << " No Type Promotion for fused_gemm_epilogue api. "; pir::IrContext* ctx = pir::IrContext::Instance(); pir::AttributeMap attribute_map = { {"trans_x", pir::BoolAttribute::get(ctx, trans_x)}, @@ -289,6 +318,10 @@ std::tuple fused_gemm_epilogue(pir::Value x, .GetBuilder() ->Build( x, y, bias, attribute_map); + if (!egr::Controller::Instance().HasGrad()) { + SetStopGradient(fused_gemm_epilogue_op.result(0), + fused_gemm_epilogue_op.result(1)); + } return std::make_tuple(fused_gemm_epilogue_op.result(0), fused_gemm_epilogue_op.result(1)); } From 03dca7f4ced2672f984159d517556514509996b1 Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:31:59 +0800 Subject: [PATCH 115/135] [XPU] Modify the condition in the SupportXPU function for XPU. 
(#68622) --- paddle/fluid/framework/operator.cc | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index adc6dfcf20afc..03e9a0532b78c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1389,15 +1389,19 @@ bool OperatorWithKernel::SupportXPU() const { return false; } else { auto& op_kernels = kernel_iter->second; - return std::any_of(op_kernels.begin(), - op_kernels.end(), - [this](OpKernelMap::const_reference kern_pair) { - return phi::is_xpu_place(kern_pair.first.place_) && - paddle::platform::is_xpu_support_op( - type_, - framework::TransToPhiDataType( - kern_pair.first.data_type_)); - }); + return std::any_of( + op_kernels.begin(), + op_kernels.end(), + [this](OpKernelMap::const_reference kern_pair) { + bool is_xpu_support1 = phi::backends::xpu::is_xpu_support_op( + type_, + framework::TransToPhiDataType(kern_pair.first.data_type_)); + bool is_xpu_support2 = phi::backends::xpu::is_xpu_support_op( + phi::TransToPhiKernelName(type_), + framework::TransToPhiDataType(kern_pair.first.data_type_)); + return phi::is_xpu_place(kern_pair.first.place_) && + (is_xpu_support1 || is_xpu_support2); + }); } } #else From a3a1d3041a345111c9b56256a5b7d4083d81ede7 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 14 Oct 2024 17:21:21 +0800 Subject: [PATCH 116/135] support __format__ for 0-D tensor (#68677) --- .../base/dygraph/tensor_patch_methods.py | 7 +++ test/legacy_test/test_eager_tensor.py | 49 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index 2e4c75c093911..e6cf73601b440 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -913,6 +913,12 @@ def __str__(self: Tensor) -> str: return tensor_to_string(self) + def __format__(self, format_spec): + if self.ndim == 0: + return self.item().__format__(format_spec) + + return object.__format__(self, format_spec) + def __deepcopy__(self, memo: dict[int, Tensor]) -> Tensor: """ Deep copy Tensor, it will always performs Tensor copy. 
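A minimal sketch of the behavior the __format__ hunk above enables (illustrative REPL-style usage with assumed scalar values, not taken from this patch):

    import paddle

    x = paddle.to_tensor(3.14159)  # 0-D tensor
    assert f"{x:.2f}" == "3.14"    # delegates to x.item().__format__(".2f")

    y = paddle.to_tensor([3.14159])  # ndim > 0 falls back to object.__format__,
    try:
        f"{y:.2f}"                 # which rejects any non-empty format spec
    except TypeError:
        pass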
@@ -1341,6 +1347,7 @@ def __cuda_array_interface__(self): ("register_hook", register_hook), ("__str__", __str__), ("__repr__", __str__), + ("__format__", __format__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), ("__array__", __array__), diff --git a/test/legacy_test/test_eager_tensor.py b/test/legacy_test/test_eager_tensor.py index c279172d6c0a5..f90a318383376 100644 --- a/test/legacy_test/test_eager_tensor.py +++ b/test/legacy_test/test_eager_tensor.py @@ -1280,6 +1280,55 @@ def test___cuda_array_interface__(self): self.assertIn("version", interface) self.assertEqual(interface["version"], 2) + def test_tensor__format__(self): + # test for floating point scalar + for width in range(0, 5): + paddle_scalar = paddle.randn([]) + numpy_scalar = paddle_scalar.numpy() + format_spec = f".{width}f" + self.assertEqual( + paddle_scalar.__format__(format_spec), + numpy_scalar.__format__(format_spec), + ) + format_spec = f".{width}e" + self.assertEqual( + paddle_scalar.__format__(format_spec), + numpy_scalar.__format__(format_spec), + ) + format_spec = f".{width}g" + self.assertEqual( + paddle_scalar.__format__(format_spec), + numpy_scalar.__format__(format_spec), + ) + + # test for integer scalar + for width in range(0, 5): + paddle_scalar = paddle.uniform([], min=-100, max=100).to("int64") + numpy_scalar = paddle_scalar.numpy() + format_spec = f"{width}d" + self.assertEqual( + paddle_scalar.__format__(format_spec), + numpy_scalar.__format__(format_spec), + ) + format_spec = f"{width}o" + self.assertEqual( + paddle_scalar.__format__(format_spec), + numpy_scalar.__format__(format_spec), + ) + format_spec = f"{width}x" + self.assertEqual( + paddle_scalar.__format__(format_spec), + numpy_scalar.__format__(format_spec), + ) + + # test for tensor that ndim > 0, expected to raise TypeError + paddle_scalar = paddle.uniform([1], min=-100, max=100) + self.assertRaises(TypeError, paddle_scalar.__format__, ".3f") + + # test for float scalar but format_spec is 'd', expected to raise ValueError + paddle_scalar = paddle.uniform([], min=-100, max=100) + self.assertRaises(ValueError, paddle_scalar.__format__, "3d") + class TestEagerTensorSetitem(unittest.TestCase): def func_setUp(self): From ec06c716bbf30c9183a7c5da788d311726836aab Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 14 Oct 2024 18:58:09 +0800 Subject: [PATCH 117/135] [Dy2St][PIR] Clear no grad edges to avoid reducer error (#68679) --- .../eager/to_static/run_program_op_func.h | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 517b37d95e28b..c4821d20f707a 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -27,33 +27,38 @@ #include "paddle/pir/include/core/value.h" #include "paddle/pir/include/dialect/control_flow/ir/cf_type.h" -// Filter params without grads in global block. In this case, we will -// tag its AutogradMeta with stop_gradient = True to avoid fault from -// reducer while training on multi-cards. 
-static void clear_no_grad_edges(const std::vector& params, - const paddle::framework::BlockDesc* block_desc, - egr::GradNodeBase* grad_node, - size_t slot_id) { +static void clear_no_grad_edges_with_partial_block( + const std::vector& params, + const paddle::framework::BlockDesc* forward_block_desc, + const paddle::framework::BlockDesc* backward_block_desc, + egr::GradNodeBase* grad_node, + size_t slot_id) { for (size_t i = 0; i < params.size(); ++i) { auto p_grad_name = paddle::framework::GradVarName(params[i].name()); - if (!block_desc->HasVar(p_grad_name)) { + if (!forward_block_desc->HasVar(p_grad_name) && + !backward_block_desc->HasVar(p_grad_name)) { VLOG(3) << "clear edge of " << p_grad_name; grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear(); } } } -static void clear_no_grad_edges_with_partial_block( +static bool IsFakeValue(const pir::Value& value) { + return value.impl() == nullptr || !value.type(); +} + +// Filter params without grads in global block. In this case, we will +// tag its AutogradMeta with stop_gradient = True to avoid fault from +// reducer while training on multi-cards. +static void pir_clear_no_grad_edges( const std::vector& params, - const paddle::framework::BlockDesc* forward_block_desc, - const paddle::framework::BlockDesc* backward_block_desc, + const std::vector& backward_params_grad, + const pir::Block* backward_block, egr::GradNodeBase* grad_node, size_t slot_id) { for (size_t i = 0; i < params.size(); ++i) { - auto p_grad_name = paddle::framework::GradVarName(params[i].name()); - if (!forward_block_desc->HasVar(p_grad_name) && - !backward_block_desc->HasVar(p_grad_name)) { - VLOG(3) << "clear edge of " << p_grad_name; + if (IsFakeValue(backward_params_grad[i])) { + VLOG(3) << "clear edge of " << params[i].name(); grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear(); } } @@ -314,6 +319,9 @@ inline void pir_run_program_ad_func( std::shared_ptr<::pir::Program>, attrs.at("backward_program")); auto forward_outputs = PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo")); + auto backward_params_grad = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bp_g")); + pir_clear_unused_out_var_in_backward( forward_outputs, backward_program->block(), step_scope[0]); @@ -324,13 +332,13 @@ inline void pir_run_program_ad_func( grad_node->SetGradOutMeta(x, /*slot id*/ 0); grad_node->SetGradOutMeta(params, /*slot id*/ 1); - // TODO(@xiongkun): rewrite by new ir representation. 
- // VLOG(2) << "clear_no_grad_edges."; - // clear_no_grad_edges_with_partial_block(params, - // forward_global_block, - // backward_global_block, - // grad_node.get(), - // [>slot id<] 1); + // Clear no grad edges + VLOG(2) << "clear no grad edges."; + pir_clear_no_grad_edges(params, + backward_params_grad, + backward_program->block(), + grad_node.get(), + /*slot id*/ 1); grad_node->SetGradInMeta(deref_out, 0); From fbc6cf3a7a358df873d3335a00836abe0b74d09c Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 15 Oct 2024 10:11:22 +0800 Subject: [PATCH 118/135] [eager Tensor] Update type hint for `Tensor.__format__` and add unittest (#68694) --- python/paddle/base/dygraph/tensor_patch_methods.py | 2 +- test/legacy_test/test_eager_tensor.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index e6cf73601b440..9de1dbf78ed2c 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -913,7 +913,7 @@ def __str__(self: Tensor) -> str: return tensor_to_string(self) - def __format__(self, format_spec): + def __format__(self, format_spec: str) -> str: if self.ndim == 0: return self.item().__format__(format_spec) diff --git a/test/legacy_test/test_eager_tensor.py b/test/legacy_test/test_eager_tensor.py index f90a318383376..01119c55f6b96 100644 --- a/test/legacy_test/test_eager_tensor.py +++ b/test/legacy_test/test_eager_tensor.py @@ -1301,6 +1301,12 @@ def test_tensor__format__(self): numpy_scalar.__format__(format_spec), ) + format_spec = "{:.{}f}" + self.assertEqual( + format_spec.format(paddle_scalar, width), + format_spec.format(numpy_scalar, width), + ) + # test for integer scalar for width in range(0, 5): paddle_scalar = paddle.uniform([], min=-100, max=100).to("int64") @@ -1321,6 +1327,12 @@ def test_tensor__format__(self): numpy_scalar.__format__(format_spec), ) + format_spec = "{:{}d}" + self.assertEqual( + format_spec.format(paddle_scalar, width), + format_spec.format(numpy_scalar, width), + ) + # test for tensor that ndim > 0, expected to raise TypeError paddle_scalar = paddle.uniform([1], min=-100, max=100) self.assertRaises(TypeError, paddle_scalar.__format__, ".3f") From 3eddd2ae9ce0e4fdc9891d3e4196436f13ad9105 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:18:49 +0800 Subject: [PATCH 119/135] fix slice compute bug (#68698) --- paddle/cinn/hlir/pe/transform.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index ebfe0076f31ee..e71fb8b109558 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -1342,7 +1342,7 @@ ir::Tensor SliceSymbolic(const ir::Tensor& A, }); for (int i = 0; i < axes.size(); i++) { - if (input_shape[axes[i]].is_constant()) { + if (input_shape[axes[i]].is_constant() && new_starts[i].is_constant()) { if (new_starts[i].as_int64() < -input_shape[axes[i]].as_int64()) { new_starts[i] = ir::Expr(0); } else if (new_starts[i].as_int64() < 0) { From dbd661427ffed173303adc46d38b8b23b1e84e3a Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:39:04 +0800 Subject: [PATCH 120/135] [CINN]optimize global memory read insert pointer (#68667) * optimize global memory read insert pointer * polish code --- 
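In outline, the change below snapshots the block that encloses each ScheduleBlockRealize, so hoisted global-memory reads are queued at that scope rather than at whichever block the mutator visited last (a reduced sketch; member names follow the diff):

    void Visit(const ir::ScheduleBlockRealize *op, Expr *expr) override {
      auto *node = expr->As<ir::ScheduleBlockRealize>();
      current_sbr_ = node;
      // current_block_ still points at the block containing this realize,
      // so capture it as the insertion point before recursing deeper.
      if (current_block_) {
        insert_block_ = current_block_;
      }
      IRMutator<>::Visit(op, expr);
    }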
.../optim/eliminate_common_global_memory_read.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/paddle/cinn/optim/eliminate_common_global_memory_read.cc b/paddle/cinn/optim/eliminate_common_global_memory_read.cc index f62b9dd653d54..8b7259f759b5f 100644 --- a/paddle/cinn/optim/eliminate_common_global_memory_read.cc +++ b/paddle/cinn/optim/eliminate_common_global_memory_read.cc @@ -335,6 +335,9 @@ struct CommonGlobalMemoryEliminator : public ir::IRMutator { ::common::errors::InvalidArgument( "The input expr should be a ScheduleBlockRealize")); current_sbr_ = node; + if (current_block_) { + insert_block_ = current_block_; + } IRMutator<>::Visit(op, expr); } @@ -387,7 +390,11 @@ struct CommonGlobalMemoryEliminator : public ir::IRMutator { "buffer_name %s should not be in global_buffer_to_local_buffer_", buffer_name)); global_buffer_to_local_buffer_[buffer_name] = new_tensor; - block_to_insert_stmts_[current_block_].push_back(new_sbr); + + PADDLE_ENFORCE_NOT_NULL( + insert_block_, + ::common::errors::InvalidArgument("insert block CAN NOT be nullptr")); + block_to_insert_stmts_[insert_block_].push_back(new_sbr); } void SubstituteGlobalTensor(ir::Load* load_node, @@ -405,7 +412,8 @@ struct CommonGlobalMemoryEliminator : public ir::IRMutator { std::unordered_map global_buffer_to_local_buffer_; std::unordered_map> block_to_insert_stmts_; - ir::Block* current_block_; + ir::Block* current_block_{nullptr}; + ir::Block* insert_block_{nullptr}; ir::ScheduleBlockRealize* current_sbr_; }; From 3a523bf9191fbd8039a06f5a5fba6249aa327940 Mon Sep 17 00:00:00 2001 From: lizexu123 <39205361+lizexu123@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:43:44 +0800 Subject: [PATCH 121/135] fix bug (#68647) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * trt<8.2 编译报错 * 冲突 * fix --- .../fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc index 1cde0e8630e91..333c76c159c74 100644 --- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc +++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc @@ -1406,8 +1406,6 @@ class NearestInterV2Pattern } } - pir::Value size_tensor = op.operand_source(2); - auto data_format = op->attribute("data_format").AsString(); if (data_format != "NCHW" && data_format != "NHWC") { @@ -1420,13 +1418,10 @@ class NearestInterV2Pattern VLOG(3) << "The interp_method of NearestInterV2 is not nearest"; return false; } - bool has_size_input = false; - if (size_tensor) { - has_size_input = true; - } #if IS_TRT_VERSION_GE(8200) - if (has_size_input) { + pir::Value size_tensor = op.operand_source(2); + if (size_tensor) { auto size_tensor_type = size_tensor.type(); if (size_tensor_type.isa()) { auto vector_type = size_tensor.type().dyn_cast(); From 4fc054cd348c72a6b85cd3f2e4c551ed73abaf81 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:46:01 +0800 Subject: [PATCH 122/135] =?UTF-8?q?=E3=80=90CINN=E3=80=91Add=20TryFuse=20f?= =?UTF-8?q?unc=20for=20IterExpr.=20(#68685)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tryFuse. 
* add const for hash * fix bug --- paddle/cinn/common/iter_simplify.cc | 175 ++++++++++++++++++-- paddle/cinn/common/iter_simplify.h | 27 ++- paddle/cinn/common/iter_util.h | 29 ++++ paddle/cinn/ir/ir_base.h | 2 +- test/cpp/pir/cinn/adt/iter_simplify_test.cc | 26 +++ 5 files changed, 236 insertions(+), 23 deletions(-) diff --git a/paddle/cinn/common/iter_simplify.cc b/paddle/cinn/common/iter_simplify.cc index 44f2b7df6e8ae..84d13cdd61f9e 100644 --- a/paddle/cinn/common/iter_simplify.cc +++ b/paddle/cinn/common/iter_simplify.cc @@ -50,7 +50,7 @@ ir::IndexExpr IterMapToExprNormalizer::ConvertIterSum(ir::IterSum* expr) { ir::IndexExpr IterMapToExprNormalizer::ConvertIterSplit(ir::IterSplit* expr) { // quick branch - if (IsZero(expr->scale)) return ir::IndexExpr(0); + if (IsZero(expr->scale) || IsOne(expr->extent)) return ir::IndexExpr(0); ir::IndexExpr source; ir::IterMark* mark = expr->source.As(); if (auto opt = mark->source.As()) { @@ -67,7 +67,7 @@ ir::IndexExpr IterMapToExprNormalizer::ConvertIterSplit(ir::IterSplit* expr) { return source * expr->scale; } else if (ProveLE( mark->extent, expr->lower_factor * expr->extent, analyzer_)) { - if (IsOne(expr->extent) && !IsOne(mark->extent)) { + if (IsOne(expr->extent)) { return ir::Zero(expr->extent.type()); } return source / expr->lower_factor * expr->scale; @@ -242,10 +242,13 @@ Expr IterMapRewriter::PreprocessDividend(const Expr& dividend) { if (sum->args.size() == 1) { return dividend; } - // TODO(liuruyan): number of split in sum is greater then 1, Do `tryFuse` in - // latter. - auto fused = dividend; - return fused; + auto opt_fused = TryFuse(dividend); + if (!opt_fused) { + PADDLE_THROW(::common::errors::InvalidArgument( + "Dividend can't be written as a single fused IterSum")); + return ir::IndexExpr(); + } + return opt_fused.value(); } else { PADDLE_THROW( ::common::errors::InvalidArgument("Expect dividend is IterExpr.")); @@ -253,9 +256,9 @@ Expr IterMapRewriter::PreprocessDividend(const Expr& dividend) { } } -ir::IndexExpr IterMapRewriter::SplitDivConst(ir::IndexExpr lhs_expr, - ir::IndexExpr base, - ir::IndexExpr rhs) { +ir::Expr IterMapRewriter::SplitDivConst(ir::Expr lhs_expr, + ir::IndexExpr base, + ir::IndexExpr rhs) { // (lhs_expr + base) // rhs if (IsOne(rhs)) { if (IsZero(base)) return lhs_expr; @@ -317,13 +320,13 @@ ir::IndexExpr IterMapRewriter::SplitDivConst(ir::IndexExpr lhs_expr, (lhs->extent + rhs - 1) / rhs, ir::One(rhs.type())); } - - return ir::IterSum::Make({new_split}, base / rhs); + return IsZero(base / rhs) ? 
new_split + : ir::IterSum::Make({new_split}, base / rhs); } -ir::IndexExpr IterMapRewriter::SplitModConst(ir::IndexExpr lhs_expr, - ir::IndexExpr base, - ir::IndexExpr rhs) { +ir::Expr IterMapRewriter::SplitModConst(ir::Expr lhs_expr, + ir::IndexExpr base, + ir::IndexExpr rhs) { // (lhs_expr + base) % rhs if (IsOne(rhs)) { return ir::Zero(lhs_expr.type()); @@ -366,7 +369,149 @@ ir::IndexExpr IterMapRewriter::SplitModConst(ir::IndexExpr lhs_expr, return ir::IterSplit::Make(lhs->source, lhs->lower_factor, rhs, lhs->scale); } -ir::IndexExpr IterMapRewriter::ToIterSum(const Expr& expr) { +int32_t IterMapRewriter::FindFirstPossibleUnitExtentIndex( + const ir::IterSum& expr) { + for (size_t i = 0; i < expr.args.size(); ++i) { + if (IsOne(expr.args[i].As()->extent)) + return static_cast(i); + } + return static_cast(expr.args.size()); +} + +int32_t IterMapRewriter::FindIterWithExactScale( + const ir::IterSum& expr, + const std::vector& skip_flag, + const ir::IndexExpr& expected_scale, + const ir::Expr& match_source, + int32_t rbegin, + int32_t first_possible_unit_extent_pos) { + if (rbegin == -1) { + rbegin = static_cast(expr.args.size()) - 1; + } + int32_t matched_pos = -1; + // use reverse search, as smallest scale usually are near the end. + for (int32_t j = rbegin; j >= 0; --j) { + if (skip_flag[j]) continue; + auto split = expr.args[j].As(); + if (match_source.defined() && match_source != split->source) continue; + const ir::IndexExpr& cur_scale = split->scale; + // for bijective mapping, the matched scale must equal to expected scale + if (ProveEQ(cur_scale, expected_scale, analyzer_)) { + if (IsOne(split->extent)) return j; + if (matched_pos == -1) { + matched_pos = j; + } + if (j <= first_possible_unit_extent_pos) return matched_pos; + } + } + return matched_pos; +} + +int32_t IterMapRewriter::FindBaseIter(const ir::IterSum& expr, + const std::vector& skip_flag, + const ir::Expr& match_source, + int32_t rbegin) { + if (rbegin == -1) { + rbegin = static_cast(expr.args.size()) - 1; + } + + int base_index = -1; + int64_t min_const_scale = 0; + + for (int32_t i = rbegin; i >= 0; --i) { + if (skip_flag[i]) continue; + auto split = expr.args[i].As(); + if (match_source.defined() && match_source != split->source) continue; + if (const auto* op = split->scale.As()) { + if (base_index == -1 || op->value < min_const_scale) { + min_const_scale = op->value; + base_index = static_cast(i); + } else if (op->value == min_const_scale) { + if (IsOne(split->extent) && + !IsOne(expr.args[base_index].As()->extent)) { + base_index = static_cast(i); + } + } + } + } + if (base_index != -1) return base_index; + + int32_t min_reduce_size = 0; + for (int32_t i = rbegin; i >= 0; --i) { + if (skip_flag[i]) continue; + auto split = expr.args[i].As(); + if (match_source.defined() && match_source != split->source) continue; + int32_t reduce_size = 0; + auto fcollect = [&](const ir::IndexExpr&) { ++reduce_size; }; + UnpackReduction(split->scale, fcollect); + if (base_index == -1 || reduce_size < min_reduce_size) { + min_reduce_size = reduce_size; + base_index = static_cast(i); + } + } + return base_index; +} + +std::optional IterMapRewriter::TryFuse(const ir::Expr& expr) { + auto iter_sum = expr.As(); + if (!iter_sum) return std::nullopt; + if (iter_sum->args.size() <= 1) return std::nullopt; + // TODO(liuruyan): fuse iter with same source. 
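  // Editorial annotation (not part of the original patch): the matching loop
  // below succeeds only when the splits can be consumed in an order where
  // each expected scale equals extent * scale of the previously matched
  // split, starting from the split with the smallest constant scale chosen
  // by FindBaseIter. Scales {32, 8, 1} with extents {-, 4, 8} chain as
  // 1 -> 1*8 = 8 -> 8*4 = 32 and fuse into one IterMark; a j*7 term breaks
  // the chain, TryFuse returns std::nullopt, and PreprocessDividend throws.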
+ + std::vector visited(iter_sum->args.size(), false); + int base_index = FindBaseIter(*iter_sum, visited, ir::IndexExpr(), -1); + if (base_index == -1) return std::nullopt; + ir::IndexExpr base_scale = + iter_sum->args[base_index].As()->scale; + + std::vector grouped_iters; + + ir::IndexExpr expected_extra_base = ir::Zero(iter_sum->type()); + ir::IndexExpr tail_extent = ir::Zero(iter_sum->type()); + ir::IndexExpr expected_scale = base_scale; + int first_possible_unit_extent_pos = + FindFirstPossibleUnitExtentIndex(*iter_sum); + + for (size_t i = 0; i < iter_sum->args.size(); ++i) { + ir::IndexExpr matched_scale{nullptr}; + int matched_pos = + i == 0 ? base_index + : FindIterWithExactScale(*iter_sum, + visited, + expected_scale, + ir::IndexExpr(), + -1, + first_possible_unit_extent_pos); + if (matched_pos != -1) matched_scale = expected_scale; + + if (matched_pos == -1) return std::nullopt; + + visited[matched_pos] = true; + auto arg_copy = ir::ir_utils::IRCopy(iter_sum->args[matched_pos]); + auto arg = arg_copy.As(); + arg->scale = arg->scale / base_scale; + grouped_iters.push_back(arg_copy); + expected_scale = MulAndNormalize( + iter_sum->args[matched_pos].As()->extent, matched_scale); + } + std::reverse(grouped_iters.begin(), grouped_iters.end()); + Expr grouped_sum = + ir::IterSum::Make(grouped_iters, ir::Zero(iter_sum->type())); + + auto it = sum_fuse_map_.find(grouped_sum); + if (it != sum_fuse_map_.end()) { + return ir::IterSum::Make({ir::IterSplit::Make(it->second, base_scale)}, + iter_sum->base); + } else { + // new iter, form a new mark + auto mark = ir::IterMark::Make(grouped_sum, expected_scale / base_scale); + sum_fuse_map_[grouped_sum] = mark; + return ir::IterSum::Make({ir::IterSplit::Make(mark, base_scale)}, + iter_sum->base); + } +} + +Expr IterMapRewriter::ToIterSum(const Expr& expr) { if (expr.As()) { return expr; } else if (auto split = expr.As()) { diff --git a/paddle/cinn/common/iter_simplify.h b/paddle/cinn/common/iter_simplify.h index cedb366cfa3b7..d34ba0c7ee238 100644 --- a/paddle/cinn/common/iter_simplify.h +++ b/paddle/cinn/common/iter_simplify.h @@ -84,7 +84,7 @@ class IterMapRewriter : public ir::IRMutator<> { void Visit(const ir::Mod* op, Expr* expr) override; private: - static ir::IndexExpr ToIterSum(const Expr& expr); + static Expr ToIterSum(const Expr& expr); static void AddToLhs(ir::IterSum* lhs, const ir::IterSplit& rhs, int sign); @@ -94,16 +94,29 @@ class IterMapRewriter : public ir::IRMutator<> { Expr PreprocessDividend(const Expr& dividend); - ir::IndexExpr SplitDivConst(ir::IndexExpr lhs, - ir::IndexExpr base, - ir::IndexExpr rhs); + Expr SplitDivConst(Expr lhs, ir::IndexExpr base, ir::IndexExpr rhs); - ir::IndexExpr SplitModConst(ir::IndexExpr lhs, - ir::IndexExpr base, - ir::IndexExpr rhs); + Expr SplitModConst(Expr lhs, ir::IndexExpr base, ir::IndexExpr rhs); + + int32_t FindIterWithExactScale(const ir::IterSum& expr, + const std::vector& skip_flag, + const ir::IndexExpr& expected_scale, + const Expr& match_source, + int32_t rbegin = -1, + int32_t first_possible_unit_extent_pos = 0); + + int32_t FindFirstPossibleUnitExtentIndex(const ir::IterSum& expr); + + int32_t FindBaseIter(const ir::IterSum& expr, + const std::vector& skip_flag, + const Expr& match_source, + int32_t rbegin = -1); + + std::optional TryFuse(const Expr& expr); std::unordered_map var_map_; std::vector input_marks_; + std::unordered_map sum_fuse_map_; common::SymbolicExprAnalyzer analyzer_; }; diff --git a/paddle/cinn/common/iter_util.h b/paddle/cinn/common/iter_util.h index 
2d4e8f50c0387..408487dfe754c 100644 --- a/paddle/cinn/common/iter_util.h +++ b/paddle/cinn/common/iter_util.h @@ -103,5 +103,34 @@ bool ProveLE(const Expr& lhs, return analyzer.ProveLE(lhs, rhs).value_or(false); } +template +inline void UnpackReduction(const ir::IndexExpr& value, FLeaf fleaf) { + if (const TNode* node = value.As()) { + UnpackReduction(node->a(), fleaf); + UnpackReduction(node->b(), fleaf); + } else { + fleaf(value); + } +} + +// TODO(liuruyan): canby simplify into IndexExpr multiply. +inline ir::IndexExpr MulAndNormalize(const ir::IndexExpr& lhs, + const ir::IndexExpr& rhs) { + int64_t cscale = 1; + ir::IndexExpr res = ir::One(lhs.type()); + auto fcollect = [&](ir::IndexExpr val) { + if (const auto* intimm = val.As()) { + cscale *= intimm->value; + } else { + res = res * val; + } + }; + UnpackReduction(lhs, fcollect); + UnpackReduction(rhs, fcollect); + if (cscale != 1) { + res = res * ir::IndexExpr(make_shared(res.type(), cscale)); + } + return res; +} } // namespace common } // namespace cinn diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index 48ba075929b2c..e4cdf152ff23f 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -597,7 +597,7 @@ namespace std { template <> struct hash { - size_t operator()(const cinn::ir::Expr& x) { + size_t operator()(const cinn::ir::Expr& x) const { return reinterpret_cast(x.get()); } }; diff --git a/test/cpp/pir/cinn/adt/iter_simplify_test.cc b/test/cpp/pir/cinn/adt/iter_simplify_test.cc index c485c602e3d0d..907abc8b02a03 100644 --- a/test/cpp/pir/cinn/adt/iter_simplify_test.cc +++ b/test/cpp/pir/cinn/adt/iter_simplify_test.cc @@ -352,5 +352,31 @@ TEST_F(TestIterSimplify, mod) { TEST_EXPR(e14, gt14, Expr(0)); } +TEST_F(TestIterSimplify, fuse_not_same_source) { + IterMapRewriter rewriter{{i, j, k, i_j_k_fused}, analyzer}; + IterMapToExprNormalizer normalizer{analyzer}; + + auto gt1 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(32)), + ITER_SPLIT(ITER_MARK_VAR(j), ir::IndexExpr(8)), + ITER_SPLIT(ITER_MARK_VAR(k), ir::IndexExpr(1))), + ir::IndexExpr(64)), + ir::IndexExpr(8), + ir::IndexExpr(8), + ir::IndexExpr(1))); + auto gt2 = ITER_SUM(ITER_SPLIT( + ITER_MARK_SUM(ITER_SUM(ITER_SPLIT(ITER_MARK_VAR(i), ir::IndexExpr(4)), + ITER_SPLIT(ITER_MARK_VAR(j), ir::IndexExpr(1))), + ir::IndexExpr(8)))); + + ir::Expr e1 = (i * 32 + j * 8 + k) / 8; + ir::Expr e2 = (i * 32 + j * 8) / 8; + ir::Expr e3 = (i * 32 + j * 7) / 8; + + TEST_EXPR(e1, gt1, (i * 32 + j * 8 + k) / 8); + TEST_EXPR(e2, gt2, i * 4 + j); + EXPECT_ANY_THROW(rewriter.Rewrite(&e3)); +} + } // namespace common } // namespace cinn From f56e672ecb7ddc30e856839d951fc1abc704511e Mon Sep 17 00:00:00 2001 From: crazyxiaoxi <113622186+crazyxiaoxi@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:10:02 +0800 Subject: [PATCH 123/135] =?UTF-8?q?[CINN]=20=E3=80=90Infer=20Symbolic=20Sh?= =?UTF-8?q?ape=20BUAA=20=E3=80=91Add=20flashmask=5Fattention=20op=20=20(#6?= =?UTF-8?q?8385)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * first * fix build * fix --- .../multiary_infer_sym.cc | 69 +++++++++++++++++++ .../infer_symbolic_shape/multiary_infer_sym.h | 1 + paddle/phi/ops/yaml/ops.yaml | 2 +- 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 9862430317afd..b7084d18a515b 100644 --- 
a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -1748,6 +1748,75 @@ bool FlashAttnVarlenQkvpackedOpInferSymbolicShape( // return true; // } +bool FlashmaskAttentionOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const symbol::ShapeOrDataDimExprs &q = + infer_context->GetShapeOrDataForValue(op->operand_source(0)); + const symbol::ShapeOrDataDimExprs &k = + infer_context->GetShapeOrDataForValue(op->operand_source(1)); + const symbol::ShapeOrDataDimExprs &v = + infer_context->GetShapeOrDataForValue(op->operand_source(2)); + + PADDLE_ENFORCE_EQ(q.shape().size(), + 4, + common::errors::InvalidArgument( + "flash_attn receive input with dim " + "[batch_size, seq_len, num_heads, head_dim]")); + + infer_context->AddEqualCstr(q.shape()[0], k.shape()[0]); + infer_context->AddEqualCstr(q.shape()[0], v.shape()[0]); + infer_context->AddEqualCstr(k.shape()[1], v.shape()[1]); + + if (op->operand_source(3)) { + const std::vector &startend_row_indices = + infer_context->GetShapeOrDataForValue(op->operand_source(4)).shape(); + PADDLE_ENFORCE_EQ( + startend_row_indices.size(), + 4, + common::errors::InvalidArgument( + "flashmask_attention receive startend_row_indices with dim " + "[batch_size, num_heads,seq_len, mask_bounds]")); + } + std::vector out_shape = q.shape(); + + out_shape.back() = v.shape().back(); + + infer_context->SetShapeOrDataForValue( + op->result(0), symbol::TensorShapeOrDataDimExprs(out_shape)); + + // GPU has round for seqlen, but XPU has not. Here we align with the GPU + // version. + auto round_multiple = [](symbol::DimExpr x) { + auto m = symbol::DimExpr{128}; + auto m_minus_one = symbol::DimExpr{127}; + return (x + m_minus_one) / m * m; + }; + auto batch_size_expr = q.shape()[0]; + auto num_heads_expr = q.shape()[2]; + auto seqlen_q_rounded_expr = round_multiple(q.shape()[1]); + auto seqlen_k_rounded_expr = round_multiple(k.shape()[1]); + + if (op->result(1)) { + std::vector softmax_shape{batch_size_expr, + num_heads_expr, + seqlen_q_rounded_expr, + seqlen_k_rounded_expr}; + infer_context->SetShapeOrDataForValue( + op->result(1), symbol::TensorShapeOrDataDimExprs(softmax_shape)); + } + if (op->result(2)) { + std::vector softmax_lse_shape{ + batch_size_expr, num_heads_expr, seqlen_q_rounded_expr}; + infer_context->SetShapeOrDataForValue( + op->result(2), symbol::TensorShapeOrDataDimExprs(softmax_lse_shape)); + } + if (op->result(3)) { + std::vector seed_offset_shape{symbol::DimExpr{2}}; + infer_context->SetShapeOrDataForValue( + op->result(3), symbol::TensorShapeOrDataDimExprs(out_shape)); + } + return true; +} bool FusedBatchNormActOpInferSymbolicShape( pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { return BatchNormOpInferSymbolicShape(op, infer_context); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index 02dda29cd5a06..b71cbea1883de 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -57,6 +57,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedFeedforward) // OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedAttention) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnVarlenQkvpacked) // 
OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashAttnUnpadded) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(FlashmaskAttention) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedBatchNormAct) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedBatchNormAct_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(FusedBnAddActivation) diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index fe6aa5fc618df..a607870f3f475 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -1996,7 +1996,7 @@ func : flashmask_attention data_type : q backward : flashmask_attention_grad - # interfaces : paddle::dialect::InferSymbolicShapeInterface + interfaces : paddle::dialect::InferSymbolicShapeInterface - op : flatten args : (Tensor x, int start_axis = 1, int stop_axis = 1) From ab8d6b65585c3ff0273cc6c31a44510ae846accd Mon Sep 17 00:00:00 2001 From: blacksheep-Aristotle Date: Tue, 15 Oct 2024 11:20:49 +0800 Subject: [PATCH 124/135] [AutoParallel] fix grade_merge bug (#68664) --- .../passes/auto_parallel_gradient_merge.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 524832bcd1895..8aee7e62382e7 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -332,6 +332,18 @@ def _pir_append_gradient_merge_backward_op( grad_defining_op.dist_attr.chunk_id, ) ) + # NOTE(zhangweilong): grad may in different device in auto_parallel, so need consider all_gather op + for used_grad_op in grad.all_used_ops(): + if used_grad_op.name() != "pd_op.all_gather": + continue + move_to_opt_block_flag = True + for all_gather_result in used_grad_op.results(): + for used_op in all_gather_result.all_used_ops(): + if used_op.op_role != int(OpRole.Optimize): + move_to_opt_block_flag = False + break + if move_to_opt_block_flag: + used_grad_op.op_role = int(OpRole.Optimize) opt_ops_use_grad = [ op From e298e59f35a1ab2ac5b6bd4441d360c3c856887a Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:21:19 +0800 Subject: [PATCH 125/135] fix shape compute bug (#68699) --- paddle/cinn/common/dim_expr_converter.cc | 3 ++- paddle/cinn/hlir/pe/elementwise.cc | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/cinn/common/dim_expr_converter.cc b/paddle/cinn/common/dim_expr_converter.cc index c50a075a6f0bc..cedfbb841f311 100644 --- a/paddle/cinn/common/dim_expr_converter.cc +++ b/paddle/cinn/common/dim_expr_converter.cc @@ -153,7 +153,8 @@ struct DimExprConverterWithSymbolBindings:: return inputs_[input_idx]->sym_shape[input_dim_idx]->GetDimExpr(); } // for data binding [S0, a, b], inputs[a] is Tensor A, return A(b) - return inputs_[input_idx](cinn::ir::Expr(input_dim_idx)); + return ir::Cast::Make(cinn::common::I64(), + inputs_[input_idx](cinn::ir::Expr(input_dim_idx))); } DimExprToIrExprVisitorWithSymbolBinding( diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index 6a8ff68a5f77c..de0d8b63d872e 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -356,8 +356,8 @@ ir::Tensor GenerateShape(const std::vector& inputs, VLOG(4) << "pe::GenerateShape will return a meaningless tensor when " "output_dim_exprs.size() != 1"; return Compute( - {Expr(1)}, - [=](const std::vector& indice) { return Expr(1); }, + {Expr(1l)}, + [=](const std::vector& indice) { return Expr(1l); }, name); } 
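  // Editorial annotation (not part of the original patch): both hunks in
  // this commit keep shape arithmetic uniformly in int64 -- the Expr(1l)
  // literals above and the explicit ir::Cast::Make(I64(), ...) in
  // dim_expr_converter.cc -- so that int32 subexpressions never leak into
  // int64 dim-expr math. 64-bit is also the safe width for element counts:
  // e.g. 50000 * 50000 = 2500000000 already exceeds INT32_MAX.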
cinn::common::DimExprConverterWithSymbolBindings converter(inputs, From 201080c0cc10520d456ddd9266ffe3af400ee8b3 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:21:28 +0800 Subject: [PATCH 126/135] fix concat op cpu bug (#68691) --- paddle/cinn/hlir/pe/transform.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index e71fb8b109558..844b8d7ac6cb3 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -494,10 +494,14 @@ ir::Tensor Concat(const std::vector& input_tensors, accumulate_shape = cinn::common::AutoSimplify( accumulate_shape + input_tensors[i]->shape[axis]); std::vector new_indice = indice; - new_indice[axis] = indice[axis] - accumulate_shape; - ret = ir::Select::Make(indice[axis] < accumulate_shape, - ret, - input_tensors[i + 1](new_indice)); + new_indice[axis] = + ir::Cast::Make(accumulate_shape.type(), indice[axis]) - + accumulate_shape; + ret = + ir::Select::Make(ir::Cast::Make(accumulate_shape.type(), + indice[axis]) < accumulate_shape, + ret, + input_tensors[i + 1](new_indice)); } return ret; }, From 4c1ba9a2e5006bea0ea711c788884f780285547d Mon Sep 17 00:00:00 2001 From: Shuhao Liang <50269654+lshpku@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:23:39 +0800 Subject: [PATCH 127/135] [CINN] Support grid reduce for single SR reduce (#68220) --- paddle/cinn/backends/codegen_device_util.cc | 38 +++++++++++++++ paddle/cinn/backends/codegen_device_util.h | 4 ++ .../hlir/framework/pir/compilation_cache.cc | 1 + .../hlir/framework/pir/compilation_cache.h | 10 +++- .../hlir/framework/pir/compilation_task.cc | 6 ++- .../hlir/framework/pir/op_lowering_group.h | 9 ++++ .../hlir/framework/pir/op_lowering_impl.cc | 18 +++++++ paddle/cinn/hlir/framework/pir/utils.h | 5 ++ .../config/group_tile_config.cc | 28 +++++++++++ .../group_schedule/config/group_tile_config.h | 2 + .../tactic/tile_first_general_tactic.cc | 48 +++++++++++++------ paddle/cinn/ir/lowered_func.h | 4 ++ paddle/cinn/ir/utils/ir_copy.cc | 2 + .../optim/replace_cross_block_reduction.cc | 29 +++++++---- .../instruction/cinn_jit_instruction.cc | 34 ++++++++++++- .../instruction/cinn_jit_instruction.h | 4 ++ 16 files changed, 213 insertions(+), 29 deletions(-) diff --git a/paddle/cinn/backends/codegen_device_util.cc b/paddle/cinn/backends/codegen_device_util.cc index 92a5b76c504eb..1a1d344f06179 100644 --- a/paddle/cinn/backends/codegen_device_util.cc +++ b/paddle/cinn/backends/codegen_device_util.cc @@ -262,6 +262,24 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc( ir::CallType::Extern, ir::FunctionRef(), 0); + + // create memset calls for temp_spaces if needed + std::vector call_kernel_stmts; + for (auto &temp_space : func_node->temp_spaces) { + if (temp_space.need_zero_init()) { + ir::Expr size = common::cast(temp_space.size(), common::UInt(64)); + ir::Expr call_get_arg = + lang::CallExtern(runtime::intrinsic::get_item_in_cuda_kernel_args, + {kernel_args_, ir::Expr(temp_space.arg_idx())}); + ir::Expr call_memset = lang::CallExtern( + runtime::intrinsic::call_cuda_memset, + {call_get_arg, ir::Expr(1), ir::Expr(0), size, kernel_stream_}); + call_kernel_stmts.push_back(call_memset); + } + } + call_kernel_stmts.push_back(call_extern_api); + call_extern_api = ir::Block::Make(call_kernel_stmts); + if (buckets_.empty()) { buckets_.emplace_back(ir::IfThenElse::Make(predicate, call_extern_api)); } else { @@ -270,6 
+288,26 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc( buckets_.emplace_back( ir::IfThenElse::Make(predicate, call_extern_api, false_expr)); } + + // create infer shape calls for temp_spaces + std::vector temp_space_infer_shape_stmts; + for (int i = 0; i < func_node->temp_spaces.size(); ++i) { + ir::Var tensor_shape_args(TENSOR_SHAPE_ARGS, type_of()); + ir::Expr size = + common::cast(func_node->temp_spaces[i].size(), common::Int(64)); + ir::Expr call_set_value = + lang::CallExtern(runtime::intrinsic::infer_shape_set_value, + {ir::Expr(func_node->num_output_tensors + i), + ir::Expr(0), + size, + tensor_shape_args}); + temp_space_infer_shape_stmts.push_back(call_set_value); + } + if (!temp_space_infer_shape_stmts.empty()) { + ir::Expr if_body = ir::Block::Make(temp_space_infer_shape_stmts); + temp_space_infer_shape_body_ = + ir::IfThenElse::Make(predicate, if_body, temp_space_infer_shape_body_); + } } void detail::CollectBucketStrategyHostFunctionVisitor::ProcessArgs( diff --git a/paddle/cinn/backends/codegen_device_util.h b/paddle/cinn/backends/codegen_device_util.h index b345c35598ec2..c0351ac59930c 100644 --- a/paddle/cinn/backends/codegen_device_util.h +++ b/paddle/cinn/backends/codegen_device_util.h @@ -280,6 +280,9 @@ struct CollectBucketStrategyHostFunctionVisitor infer_shape_func_body_stmts.insert( infer_shape_func_body_stmts.end(), op->infer_shape_func.as_lowered_func()->body); + if (temp_space_infer_shape_body_.defined()) { + infer_shape_func_body_stmts.push_back(temp_space_infer_shape_body_); + } std::vector infer_shape_arguments = { ir::Argument(kernel_args_, ir::Argument::IO::kOutput), @@ -307,6 +310,7 @@ struct CollectBucketStrategyHostFunctionVisitor private: std::vector buckets_; std::vector arg_defs_; + ir::Expr temp_space_infer_shape_body_; ir::Var kernel_args_; ir::Var kernel_args_num_; diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.cc b/paddle/cinn/hlir/framework/pir/compilation_cache.cc index 0b94a67d8784a..1843f0f3f57d0 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.cc @@ -54,6 +54,7 @@ pir::CINNKernelInfo BackendResource::GenerateKernelInfo() const { kernel_info.infer_shape_fn_ptr = GetInferFuncPtr(); kernel_info.CX86_fn_ptr = GetCX86HostFuncPtr(); kernel_info.symbol_args_map = GetSymbolArgsMap(); + kernel_info.temp_space_sizes = GetTempSpaceSizes(); return kernel_info; } } // namespace pir diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.h b/paddle/cinn/hlir/framework/pir/compilation_cache.h index 13edd70be6766..bea6631cbbd9a 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.h +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.h @@ -33,10 +33,12 @@ class BackendResource final { const Target& target, const std::string& host_fn_name, const std::string& infer_fn_name, - const std::map& symbol_args_map) + const std::map& symbol_args_map, + const std::vector& temp_space_sizes) : host_fn_name_(host_fn_name), infer_fn_name_(infer_fn_name), - symbol_args_map_(symbol_args_map) { + symbol_args_map_(symbol_args_map), + temp_space_sizes_(temp_space_sizes) { backend_compiler_ = backends::Compiler::Create(target); } @@ -47,6 +49,9 @@ class BackendResource final { const { return symbol_args_map_; } + const std::vector& GetTempSpaceSizes() const { + return temp_space_sizes_; + } const std::shared_ptr& GetBackendCompiler() const { return backend_compiler_; } @@ -57,6 +62,7 @@ class BackendResource final { std::string host_fn_name_; 
std::string infer_fn_name_; std::map symbol_args_map_; + std::vector temp_space_sizes_; std::shared_ptr backend_compiler_{nullptr}; }; diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index a45657252a315..29b46fd4dabbd 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -216,7 +216,8 @@ std::shared_ptr CompilationTask::BuildPirCINNKernelInfo( context_->target_, context_->group_->FuncName(), context_->group_->FuncName() + "_infer_shape", - context_->group_->symbol_args_map()); + context_->group_->symbol_args_map(), + context_->group_->temp_space_sizes()); VLOG(5) << "Start to compile module into cuda kernel..."; backend_resource->GetBackendCompiler()->Build(module, ""); backend_resource->GetBackendCompiler()->AppendCX86(CX86module); @@ -236,7 +237,8 @@ CompilationTask::CompileBroadcastModules( context_->target_, context_->group_->FuncName(), context_->group_->FuncName() + "_infer_shape", - context_->group_->symbol_args_map()); + context_->group_->symbol_args_map(), + context_->group_->temp_space_sizes()); std::vector case_func_names; std::vector broadcast_conditions; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.h b/paddle/cinn/hlir/framework/pir/op_lowering_group.h index 7299cb17eed5f..8ca97568c7419 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.h @@ -185,6 +185,14 @@ class OpLoweringGroup { return this->symbol_args_map_; } + const std::vector& temp_space_sizes() const { + return this->temp_space_sizes_; + } + + std::vector& mut_temp_space_sizes() { + return this->temp_space_sizes_; + } + private: using alignment_schedule_info_t = std::unordered_map< ::pir::Operation*, @@ -231,6 +239,7 @@ class OpLoweringGroup { std::vector output_names_; std::vector<::pir::Value> output_values_; std::map symbol_args_map_; + std::vector temp_space_sizes_; alignment_schedule_info_t alignment_schedule_info_; std::vector reduce_axis_; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 97faca515b440..ae88bc5ac604a 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -117,6 +117,12 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( func_bodies = OperationFusion(ops, func_bodies, group->fusion_tracker_ptr); std::shared_ptr fusion_group_info = GetFusionGroupInfo(func_bodies); + // TODO(liangshuhao): grid reduce is disabled for broadcast-leaf group now, + // because grid reduce introduces extra func args that currently cannot be + // unified with other broadcast-leaf groups. 
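  // Editorial annotation (not part of the original patch): the "extra func
  // args" referred to above are the temp spaces this patch threads through
  // the stack -- e.g. the global rf buffer and the zero-initialized
  // semaphore inserted by replace_cross_block_reduction.cc. Their byte
  // sizes are recorded in temp_space_sizes below and materialized as extra
  // tensor arguments by CinnJitInstruction at run time.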
+ if (group->IsBroadcastLeaf()) { + fusion_group_info->can_apply_grid_reduce = false; + } if (FLAGS_cinn_check_tensor_buffer_map) { optim::CheckTensorBufferMap(func_bodies, "BucketLower OpFusion"); @@ -386,9 +392,21 @@ std::vector OpLowererImpl::PostProcess( func = optim::Optimize(Expr(func), common::DefaultHostTarget(), false) .as_lowered_func_ref(); } + func->num_output_tensors = infer_shape_arg_tensor->size(); lowered_funcs.push_back(std::move(func)); } + // collect temp space sizes + if (lowered_funcs.size() > 1) { + for (auto& temp_space : lowered_funcs[0]->temp_spaces) { + int64_t size = -1; + if (temp_space.size().is_constant()) { + size = temp_space.size().as_int64(); + } + group->mut_temp_space_sizes().push_back(size); + } + } + return lowered_funcs; } diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h index ae3ed8a16e326..532ebf812c8be 100644 --- a/paddle/cinn/hlir/framework/pir/utils.h +++ b/paddle/cinn/hlir/framework/pir/utils.h @@ -66,6 +66,11 @@ struct CINNKernelInfo { // 3: ArgValueIdx{1, 6} // } std::map symbol_args_map; + + // Sizes in bytes of the temporary global spaces needed by the kernel. + // These spaces are allocated before the kernel is launched, appended to the + // kernel's argument list, and released when the kernel completes. + std::vector temp_space_sizes; }; struct CompatibleInfo { diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc index 810e256c2bdaa..7a39539d1953b 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc @@ -95,6 +95,7 @@ std::shared_ptr InitBasicInfo( std::make_shared(); base_info->data_rank = group_info->loop_ranges.size(); base_info->loop_strides = group_info->loop_strides; + base_info->can_apply_grid_reduce = group_info->can_apply_grid_reduce; std::set reduce_dim_loc; for (int64_t dim : group_info->reduce_axis) { @@ -179,6 +180,19 @@ BuildPureStaticShapeConfig( /* spatial_inner_num = */ 1, /* reduce_method = */ BlockReduceMethod()}; return {{bucket_info, tile_config}}; + } else if (base_info->can_apply_grid_reduce && + base_info->reduce_numel > 65536) { + BucketInfo bucket_info{/* sp_lower_bound = */ 1, + /* sp_upper_bound = */ 1, + /* rb_lower_bound = */ 2049, + /* rb_upper_bound = */ kMaxNumel}; + ScheduleConfig::TileConfig tile_config{ + /* warp_num = */ 32, + /* tree_reduce_num = */ 1024, + /* spatial_inner_num = */ 1, + /* reduce_method = */ BlockReduceMethod(), + /* grid_reduce_num = */ 8}; + return {{bucket_info, tile_config}}; } else { BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ 1, @@ -244,6 +258,20 @@ BuildPureStaticShapeConfig( /* spatial_inner_num = */ spatial_inner_num, /* reduce_method = */ BlockReduceMethod()}; return {{bucket_info, tile_config}}; + } else if (base_info->can_apply_grid_reduce && + base_info->spatial_numel <= 256 && + base_info->reduce_numel > 65536) { + BucketInfo bucket_info{/* sp_lower_bound = */ 1, + /* sp_upper_bound = */ 256, + /* rb_lower_bound = */ 65537, + /* rb_upper_bound = */ kMaxNumel}; + ScheduleConfig::TileConfig tile_config{ + /* warp_num = */ 32, + /* tree_reduce_num = */ 1024, + /* spatial_inner_num = */ 1, + /* reduce_method = */ BlockReduceMethod(), + /* grid_reduce_num = */ 8}; + return {{bucket_info, tile_config}}; } else { int64_t warp_num = 32; int64_t spatial_inner_num = 1; diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.h 
b/paddle/cinn/ir/group_schedule/config/group_tile_config.h index 93dae2982767a..996a2bdd5da3a 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.h +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.h @@ -40,6 +40,7 @@ struct ScheduleConfig { bool has_dynamic_spatial{false}; bool has_dynamic_reduce{false}; bool is_reduce_all{false}; + bool can_apply_grid_reduce{false}; IterSpaceType iter_space_type; }; @@ -48,6 +49,7 @@ struct ScheduleConfig { int64_t tree_reduce_num{1}; int64_t spatial_inner_num{1}; ReduceMethod reduce_method{NoneReduceMethod()}; + int64_t grid_reduce_num{1}; }; std::shared_ptr base_info; diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index 6d80d33d6b997..6e12bb01af12f 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -164,9 +164,11 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile( context_->config.tile_config.tree_reduce_num; const auto sp_loop = context_->config.tile_config.spatial_inner_num; const auto rd_thread = context_->config.tile_config.tree_reduce_num; + const auto rd_block = context_->config.tile_config.grid_reduce_num; VLOG(4) << "ApplyContinuousDataTile sp_thread=" << sp_thread; VLOG(4) << "ApplyContinuousDataTile sp_loop=" << sp_loop; VLOG(4) << "ApplyContinuousDataTile rd_thread=" << rd_thread; + VLOG(4) << "ApplyContinuousDataTile rd_block=" << rd_block; VLOG(4) << "ApplyContinuousDataTile vec_flatten_axis: " << utils::Join(vec_flatten_axis_, ", "); VLOG(4) << "ApplyContinuousDataTile vec_reduce_axis: " @@ -204,21 +206,14 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile( VLOG(4) << "After SplitSptial on block: [" << block_id << "], loop nest:\n" << sch->GetModule().GetExprs().front(); - // Split reduce axes -> [rd_loop, rd_thread] + // Split reduce axes -> [rd_loop, rd_block, rd_thread] + std::string global_rf_block; if (vec_reduce_axis_.size() > 0) { auto loops = sch->GetLoops(block_id); - // [S..S, R] => [S..S, R(-1), R(thread)] - sch->Split(loops[current_reduce_axis], {-1, rd_thread}); - VLOG(4) << "Before ReorderReduction on block: [" << block_id - << "], loop nest:\n" - << sch->GetModule().GetExprs().front(); + sch->Split(loops[current_reduce_axis], {-1, rd_block * rd_thread}); loops = sch->GetLoops(block_id); - // [S..S, R(-1), R(thread)] => [S..S, R(thread), R(-1)] sch->Reorder({loops[current_reduce_axis + 1], loops[current_reduce_axis]}); - VLOG(4) << "Before FactorizeReduction on block: [" << block_id - << "], loop nest:\n" - << sch->GetModule().GetExprs().front(); if (IsReductionSBlock(sch->GetBlock(block_id))) { loops = sch->GetLoops(block_id); @@ -228,6 +223,24 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile( /* with_write_back_block_init = */ false); map_rf_block_[block_id] = rf_tensor.as_tensor_ref()->name; } + + if (rd_block > 1) { + loops = sch->GetLoops(block_id); + sch->Split(loops[current_reduce_axis], {rd_block, rd_thread}); + + if (IsReductionSBlock(sch->GetBlock(block_id))) { + loops = sch->GetLoops(map_rf_block_[block_id]); + sch->Split(loops[current_reduce_axis], {rd_block, rd_thread}); + + loops = sch->GetLoops(block_id); + ir::Expr rf_tensor = + sch->FactorizeReduction(loops[current_reduce_axis], + /* rf_axis = */ 0, + /* with_write_back_block_init = */ false); + global_rf_block = rf_tensor.as_tensor_ref()->name; + rf_tensor.as_tensor_ref()->WithBuffer("global", "_" + 
global_rf_block); + } + } } VLOG(4) << "After SplitReduce on block: [" << block_id << "], loop nest:\n" << sch->GetModule().GetExprs().front(); @@ -248,16 +261,23 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile( } } if (!vec_reduce_axis_.empty() && current_reduce_axis > 0) { - // [S(blockIdx.x), optional(inner_loop), S(threadIdx.y), R..R] => - // [S(blockIdx.x), optional(inner_loop), S(threadIdx.y), R(threadIdx.x), - // R(inner_loop)] - sch->Bind(loops[current_reduce_axis], rd_axis_type); + if (rd_block > 1) { + sch->Bind(loops[current_reduce_axis], "blockIdx.y"); + if (loops.size() > current_reduce_axis + 1) { + sch->Bind(loops[current_reduce_axis + 1], rd_axis_type); + } + } else { + sch->Bind(loops[current_reduce_axis], rd_axis_type); + } } }; DoBind(sch->GetLoops(block_id)); if (map_rf_block_.count(block_id) > 0) { DoBind(sch->GetLoops(map_rf_block_[block_id])); } + if (!global_rf_block.empty()) { + DoBind(sch->GetLoops(global_rf_block)); + } VLOG(4) << "After BindCudaInfo on block: [" << block_id << "], loop nest:\n" << sch->GetModule().GetExprs().front(); diff --git a/paddle/cinn/ir/lowered_func.h b/paddle/cinn/ir/lowered_func.h index 224304bbdb23e..22c2f1c5ce6bd 100644 --- a/paddle/cinn/ir/lowered_func.h +++ b/paddle/cinn/ir/lowered_func.h @@ -157,6 +157,10 @@ struct _LoweredFunc_ : ExprNode<_LoweredFunc_> { //! argument list. std::vector temp_spaces; + //! Number of output tensors that appear in the function's argument list. + //! This number doesn't include temp_spaces. + int num_output_tensors; + //! Body of this function. Expr body; diff --git a/paddle/cinn/ir/utils/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc index 524fd4cab02a8..6362459c242d6 100644 --- a/paddle/cinn/ir/utils/ir_copy.cc +++ b/paddle/cinn/ir/utils/ir_copy.cc @@ -297,6 +297,8 @@ struct IRCopyVisitor : public ir::IRVisitorRequireReImpl { func->args = op->args; func->body = Visit(&op->body); func->temp_bufs = op->temp_bufs; + func->temp_spaces = op->temp_spaces; + func->num_output_tensors = op->num_output_tensors; func->device_api = op->device_api; diff --git a/paddle/cinn/optim/replace_cross_block_reduction.cc b/paddle/cinn/optim/replace_cross_block_reduction.cc index 8cbe3e1191780..3074528a26605 100644 --- a/paddle/cinn/optim/replace_cross_block_reduction.cc +++ b/paddle/cinn/optim/replace_cross_block_reduction.cc @@ -90,6 +90,24 @@ struct CrossBlockReductionReplacer : public ir::IRMutator<> { return reduce_var_names.count(innermost_loop->loop_var->name) > 0; } + void InsertTempSpaceToFuncArgs(ir::_LoweredFunc_* func_node, + const ir::Buffer& buffer, + bool need_zero_init) { + // insert the temp space after the last tensor argument and before the + // first scalar argument + auto insert_pos = + std::find_if(func_node->args.begin(), + func_node->args.end(), + [](const ir::Argument& arg) { return arg.is_var(); }); + + int arg_idx = std::distance(func_node->args.begin(), insert_pos); + func_node->temp_spaces.emplace_back( + CalcBufferSizeInBytes(buffer), arg_idx, need_zero_init); + + ir::Argument temp_space_arg(buffer, ir::Argument::IO::kOutput); + func_node->args.insert(insert_pos, temp_space_arg); + } + void ConvertHeapBuffersToFuncArgs(ir::_LoweredFunc_* func_node) { std::vector global_bufs; std::vector local_bufs; @@ -108,9 +126,7 @@ struct CrossBlockReductionReplacer : public ir::IRMutator<> { "Currently supports at most one global buffer.")); for (auto& buf : global_bufs) { - func_node->temp_spaces.emplace_back( - CalcBufferSizeInBytes(buf), /* arg_idx= */ func_node->args.size()); - 
func_node->args.emplace_back(buf, ir::Argument::IO::kOutput); + InsertTempSpaceToFuncArgs(func_node, buf, false); } func_node->temp_bufs = local_bufs; } @@ -231,13 +247,8 @@ struct CrossBlockReductionReplacer : public ir::IRMutator<> { ir::_LoweredFunc_* func_node = op->As(); ConvertHeapBuffersToFuncArgs(func_node); - + InsertTempSpaceToFuncArgs(func_node, semaphore_buffer_, true); func_node->temp_bufs.push_back(is_done_tensor_->buffer); - func_node->temp_spaces.emplace_back( - CalcBufferSizeInBytes(semaphore_buffer_), - /* arg_idx= */ func_node->args.size(), - /* need_zero_init = */ true); - func_node->args.emplace_back(semaphore_buffer_, ir::Argument::IO::kOutput); } void Visit(const ir::ScheduleBlockRealize* expr, ir::Expr* op) override { diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index f637643be445b..66878b263e17f 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -42,8 +42,6 @@ class CinnJitInstruction::FnPtrImpl { : cinn_kernel_info_(cinn_kernel_info) {} void InitFuncArgs(const std::vector& kernel_tensor_args) { - func_args_.clear(); - // 1. Create placeholders for tensor args for (size_t i = 0; i < kernel_tensor_args.size(); ++i) { auto* buffer = new cinn_buffer_t(); @@ -180,6 +178,15 @@ class CinnJitInstruction::FnPtrImpl { VLOG(6) << "End InferShape: " << cinn_kernel_info_.fn_name; } + void FreeFuncArgs() { + for (auto& arg : func_args_) { + if (arg.type_code() == ::cinn_type_code()) { + delete cinn_pod_value_to_buffer_p(&arg); + } + } + func_args_.clear(); + } + private: CINNKernelInfo cinn_kernel_info_; @@ -202,6 +209,7 @@ CinnJitInstruction::CinnJitInstruction( InitInputsOutputsIds(op, *value_exec_info); + // prepare input tensors for (size_t i = 0; i < op->num_operands(); ++i) { auto in = op->operand_source(i); @@ -220,6 +228,7 @@ CinnJitInstruction::CinnJitInstruction( } dev_ctx_ = phi::DeviceContextPool::Instance().Get(place_); + // prepare output tensors for (size_t i = 0; i < op->num_results(); ++i) { pir::Value result = op->result(i); auto var_name = value_exec_info->GetVarName(result); @@ -241,6 +250,20 @@ CinnJitInstruction::CinnJitInstruction( } tensor->Resize(alloc_tensor_type.dims()); } + + // prepare temp_space tensors + for (int64_t size : jit_kernel_op.cinn_kernel_info().temp_space_sizes) { + auto& tensor = temp_space_tensors_.emplace_back(); + tensor.set_type(phi::DataType::UINT8); + tensor.Resize({size}); + if (size < 0) { + need_update_shape = true; + } + } + for (auto& tensor : temp_space_tensors_) { + tensor_args_.push_back(&tensor); + } + output_tensor_size += temp_space_tensors_.size(); } void CinnJitInstruction::Run() { @@ -254,6 +277,7 @@ void CinnJitInstruction::Run() { static_cast(static_cast(dev_ctx_)->stream()); } + // 1. prepare kernel argmuments fn_ptr_impl_->InitFuncArgs(tensor_args_); if (FLAGS_cinn_bucket_compile && need_update_shape) { @@ -266,6 +290,12 @@ void CinnJitInstruction::Run() { // 2. exexute kernel fn_ptr_impl_->Run(tensor_args_, running_stream, is_gpu); + + // 3. 
release resource + fn_ptr_impl_->FreeFuncArgs(); + for (auto& tensor : temp_space_tensors_) { + tensor.clear(); + } #else VLOG(0) << "Not Supported: cinn jit instruction currently does not " "support non-CUDA kernel"; diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h index 73161731f7d3a..43421b39c9907 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h @@ -55,6 +55,10 @@ class CinnJitInstruction : public InstructionBase { bool need_update_shape{false}; std::vector tensor_args_; + // Tensors that hold the temporary spaces used by the kernel. These tensors + // are managed by CinnJitInstruction, and not exposed to phi executor. + std::vector temp_space_tensors_; + ::pir::Operation* op_{nullptr}; // not owned }; From aa092b6a1a4f8c2202ea8d376b0a53e263152ef2 Mon Sep 17 00:00:00 2001 From: zhink <33270771+zhink@users.noreply.github.com> Date: Tue, 15 Oct 2024 12:45:54 +0800 Subject: [PATCH 128/135] add block_size=1 for beam_search (#68701) --- paddle/phi/kernels/fusion/gpu/block_attn.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h index 8127f8026f3da..e6dbc3fd15687 100644 --- a/paddle/phi/kernels/fusion/gpu/block_attn.h +++ b/paddle/phi/kernels/fusion/gpu/block_attn.h @@ -1560,6 +1560,10 @@ void dispatch_blha_impl_blocksize(const Block_AttN_params ¶ms, StoreFunc store_func, const int use_cachekv_int8) { switch (params.block_size) { + case 1: + dispatch_blha_impl_key_and_thread( + params, stream, load_func, store_func, use_cachekv_int8); + break; case 32: dispatch_blha_impl_key_and_thread( params, stream, load_func, store_func, use_cachekv_int8); From ea0f4ba21a97e73adbffa3068200492047b4ce1e Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 15 Oct 2024 13:01:54 +0800 Subject: [PATCH 129/135] =?UTF-8?q?=E3=80=90Hackathon=207th=20Fundable=20P?= =?UTF-8?q?rojects=201=20No.101=E3=80=91=20[fluid=5Fops]=20faster=5Ftokeni?= =?UTF-8?q?zer=20(#68361)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/framework/CMakeLists.txt | 5 - paddle/fluid/framework/feed_fetch_type.h | 20 +- paddle/fluid/framework/operator.cc | 3 + paddle/fluid/framework/string_array.h | 117 +--- paddle/fluid/framework/tensor_ref_array.h | 12 +- paddle/fluid/framework/type_info.cc | 2 - paddle/fluid/imperative/prepared_operator.h | 17 +- paddle/fluid/operators/CMakeLists.txt | 4 +- .../ops_signature/faster_tokenizer_sig.cc | 33 + paddle/fluid/operators/string/CMakeLists.txt | 2 +- .../operators/string/faster_tokenizer_op.cc | 423 +----------- .../operators/string/faster_tokenizer_op.h | 210 ------ .../pir/dialect/op_generator/ops_api_gen.py | 1 + paddle/phi/core/CMakeLists.txt | 1 + paddle/phi/core/kernel_registry.cc | 14 + paddle/phi/core/kernel_utils.h | 5 + paddle/phi/core/utils/type_info.cc | 3 + paddle/phi/core/vocab/CMakeLists.txt | 1 + .../core/vocab}/phi_tensor_base_vector.h | 14 +- .../core/vocab}/string_array.cc | 37 +- paddle/phi/core/vocab/string_array.h | 142 ++++ paddle/phi/infermeta/ternary.cc | 16 + paddle/phi/infermeta/ternary.h | 11 + .../kernels/cpu/faster_tokenizer_kernel.cc | 617 ++++++++++++++++++ .../phi/ops/yaml/inconsistent/static_ops.yaml | 11 + paddle/phi/ops/yaml/op_compat.yaml | 6 + 26 files changed, 932 insertions(+), 795 deletions(-) create 
mode 100644 paddle/fluid/operators/ops_signature/faster_tokenizer_sig.cc delete mode 100644 paddle/fluid/operators/string/faster_tokenizer_op.h create mode 100644 paddle/phi/core/vocab/CMakeLists.txt rename paddle/{fluid/framework => phi/core/vocab}/phi_tensor_base_vector.h (92%) rename paddle/{fluid/framework => phi/core/vocab}/string_array.cc (78%) create mode 100644 paddle/phi/core/vocab/string_array.h create mode 100644 paddle/phi/kernels/cpu/faster_tokenizer_kernel.cc diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 516b80506b40e..b2e5e539c9bd8 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -101,11 +101,6 @@ foreach(OP_DEF_FILE ${OP_DEF_FILES}) endforeach() file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") -cc_library( - string_array - SRCS string_array.cc - DEPS utf8proc phi common) - cc_library( data_type SRCS data_type.cc diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 6be31a062ef07..5f7c81c90f18f 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -21,13 +21,12 @@ limitations under the License. */ #include "paddle/fluid/framework/string_array.h" #include "paddle/phi/core/extended_tensor.h" -namespace paddle { -namespace framework { -using FeedType = - paddle::variant; +namespace phi { +using FeedType = paddle:: + variant; using FetchType = paddle::variant; template <> @@ -40,9 +39,16 @@ struct PhiVectorType { const char *type_name = "PhiVectorFetchType"; }; -using FeedList = paddle::framework::PhiVector; -using FetchList = paddle::framework::PhiVector; +using FeedList = PhiVector; +using FetchList = PhiVector; +} // namespace phi +namespace paddle { +namespace framework { +using FeedType = phi::FeedType; +using FetchType = phi::FetchType; +using FeedList = phi::FeedList; +using FetchList = phi::FetchList; using FetchUnmergedList = std::vector>; inline bool data_is_lod_tensor(const FetchType &data) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 03e9a0532b78c..44f76ecb21791 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -3257,6 +3257,9 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (var->IsType()) { tensor_in = &(var->Get()); phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + tensor_in = &(var->Get()); + phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h index ddcc15e3dca59..fc3f3d8146a98 100644 --- a/paddle/fluid/framework/string_array.h +++ b/paddle/fluid/framework/string_array.h @@ -14,119 +14,4 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/phi_tensor_base_vector.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/extended_tensor.h" - -namespace paddle { -namespace framework { - -// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op and we don't -// recommend widely use it. Because faster_tokenizer_op may be deleted in the -// future and this class will be deleted. 
- -class Vocab : public phi::ExtendedTensor, - public phi::TypeInfoTraits { - public: - Vocab() = default; - - Vocab(Vocab&& other) = default; - - Vocab(const Vocab& other) = default; - - Vocab& operator=(const Vocab& other) = default; - - Vocab& operator=(Vocab&& other) = default; - - Vocab& operator=( - const std::unordered_map& other) { - this->data_ = other; - return *this; - } - - /// \brief Destroy the Vocab and release exclusive resources. - virtual ~Vocab() = default; - - public: - /// \brief Returns the name of the class for type traits. - /// \return The name of the class. - static const char* name() { return "Vocab"; } - - size_t size() const { return data_.size(); } - - void clear() { data_.clear(); } - - void emplace(const std::wstring& key, std::int32_t value) { - data_.emplace(key, value); - } - - std::int32_t at(const std::wstring& key) { return data_.at(key); } - - std::int32_t at(const std::wstring& key) const { return data_.at(key); } - - std::unordered_map::iterator find( - const std::wstring& key) { - return data_.find(key); - } - - std::unordered_map::const_iterator find( - const std::wstring& key) const { - return data_.find(key); - } - - std::unordered_map::iterator begin() { - return data_.begin(); - } - - std::unordered_map::const_iterator begin() const { - return data_.begin(); - } - - std::unordered_map::iterator end() { - return data_.end(); - } - - std::unordered_map::const_iterator end() const { - return data_.end(); - } - - private: - std::unordered_map data_; -}; - -// Note(YuanRisheng): PhiVector is essentially a vector that only used for PHI -// Kernel. It can be used when you define a non-tensor type that needs to be -// stored in a vector as PHI kernel argument. - -template <> -struct PhiVectorType { - const char* type_name = "PhiVectorString"; -}; - -using String = std::string; -using Strings = PhiVector; - -// Convert the std::string type to the std::string type. -bool ConvertStrToWstr(const std::string& src, std::wstring* res); -// Convert the std::wstring type to the std::string type. -void ConvertWstrToStr(const std::wstring& src, std::string* res); -// Normalization Form Canonical Decomposition. -void NFD(const std::string& s, std::string* ret); - -// Write the data which is type of -// std::unordered_map to ostream. -void StringMapToStream(std::ostream& os, - const std::unordered_map& data); - -// Read the data which is type of -// std::unordered_map from istream. 
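[Editor's note] The Vocab class removed here moves to phi (see the new paddle/phi/core/vocab target in this commit's file list). A hedged usage sketch, assuming the moved class keeps the interface shown above and is exported as phi::Vocab; the "[UNK]" fallback entry is an assumption borrowed from the tokenizer, not part of the class:

#include <cstdint>
#include <string>

#include "paddle/phi/core/vocab/string_array.h"  // assumed new home of Vocab

// Look a token up in the moved Vocab; the interface used here (find/end/at
// over a std::wstring -> std::int32_t map) is exactly the one removed above.
std::int32_t LookupOrUnk(const phi::Vocab& vocab, const std::wstring& token) {
  auto it = vocab.find(token);
  return it != vocab.end() ? it->second : vocab.at(L"[UNK]");
}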
-void StringMapFromStream(std::istream& is, - std::unordered_map* data); -} // namespace framework -} // namespace paddle +#include "paddle/phi/core/vocab/string_array.h" diff --git a/paddle/fluid/framework/tensor_ref_array.h b/paddle/fluid/framework/tensor_ref_array.h index d5f5e0b61f2f9..80211301b6976 100644 --- a/paddle/fluid/framework/tensor_ref_array.h +++ b/paddle/fluid/framework/tensor_ref_array.h @@ -14,15 +14,17 @@ #pragma once -#include "paddle/fluid/framework/phi_tensor_base_vector.h" - -namespace paddle { -namespace framework { +#include "paddle/phi/core/vocab/phi_tensor_base_vector.h" +namespace phi { template <> -struct PhiVectorType { +struct PhiVectorType { const char* type_name = "VariableRefArray"; }; +} // namespace phi + +namespace paddle { +namespace framework { using VariableRefArray = PhiVector; diff --git a/paddle/fluid/framework/type_info.cc b/paddle/fluid/framework/type_info.cc index daa91dde9d6db..f52db412ac01d 100644 --- a/paddle/fluid/framework/type_info.cc +++ b/paddle/fluid/framework/type_info.cc @@ -39,8 +39,6 @@ bool TypeInfoTraits::classof(const BaseT* obj) { } template class TypeInfoTraits; -template class TypeInfoTraits; -template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index af224dfc5d282..ed077dfb82a1a 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -34,6 +34,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/vocab/string_array.h" COMMON_DECLARE_bool(use_mkldnn); @@ -307,8 +308,14 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); continue; } else if (input_defs[i].type_index == - std::type_index(typeid( - paddle::optional>))) { + std::type_index( + typeid(paddle::optional)) || + input_defs[i].type_index == + std::type_index(typeid(paddle::optional)) || + input_defs[i].type_index == + std::type_index( + typeid(paddle::optional< + std::vector>))) { kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); @@ -338,6 +345,12 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, } else if (var.template IsType()) { tensor_in = &(var.template Get()); kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else { PADDLE_THROW(common::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6d0bca80b96a5..0944bd7c5773f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -81,8 +81,8 @@ op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_o op_library(run_program_op DEPS executor_cache ${OP_HEADER_DEPS}) target_link_libraries(run_program_op phi common) op_library(quantize_linear_op DEPS phi common) -op_library(save_combine_op DEPS string_array phi 
common) -op_library(load_combine_op DEPS string_array) +op_library(save_combine_op DEPS phi) +op_library(load_combine_op DEPS phi) op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/ops_signature/faster_tokenizer_sig.cc b/paddle/fluid/operators/ops_signature/faster_tokenizer_sig.cc new file mode 100644 index 0000000000000..0ec3e8ebca806 --- /dev/null +++ b/paddle/fluid/operators/ops_signature/faster_tokenizer_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FasterTokenizerOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature("faster_tokenizer", + {"Vocab", "Text", "TextPair"}, + {"do_lower_case", + "is_split_into_words", + "max_seq_len", + "pad_to_max_seq_len"}, + {"InputIds", "SegmentIds"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(faster_tokenizer, + phi::FasterTokenizerOpArgumentMapping); diff --git a/paddle/fluid/operators/string/CMakeLists.txt b/paddle/fluid/operators/string/CMakeLists.txt index 1da2e8e455da0..2065455e61f42 100644 --- a/paddle/fluid/operators/string/CMakeLists.txt +++ b/paddle/fluid/operators/string/CMakeLists.txt @@ -3,4 +3,4 @@ if(WITH_UNITY_BUILD) # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops. include(unity_build_rule.cmake) endif() -register_operators(DEPS op_version_registry utf8proc string_array) +register_operators(DEPS op_version_registry phi) diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 10e08e86dc685..c02c3d0752447 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -9,10 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/string/faster_tokenizer_op.h" - #include - #include #include #include @@ -24,427 +21,12 @@ limitations under the License. 
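[Editor's note] A sketch of the phi kernel declaration that the argument mapping above implies: inputs {Vocab, Text, TextPair}, the four attributes in order, outputs {InputIds, SegmentIds}. The real definition lives in the new paddle/phi/kernels/cpu/faster_tokenizer_kernel.cc; the template parameters and parameter names here are assumptions, not copied from that file.

namespace phi {

template <typename T, typename Context>
void FasterTokenizerKernel(const Context& dev_ctx,
                           const Vocab& vocab,
                           const Strings& text,
                           const paddle::optional<Strings>& text_pair,
                           bool do_lower_case,
                           bool is_split_into_words,
                           int max_seq_len,
                           bool pad_to_max_seq_len,
                           DenseTensor* input_ids,
                           DenseTensor* segment_ids);

}  // namespace phi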
*/ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/string_array.h" namespace paddle { namespace operators { -using std::ifstream; -using std::int64_t; -using std::size_t; -using std::string; -using std::unordered_map; -using std::unordered_set; -using std::vector; -using std::wstring; - -const wstring kStripChars = L" \t\n\r\v\f"; - -inline bool IsControl(const wchar_t& ch) { - if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false; - auto cat = utf8proc_category(ch); - if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true; - return false; -} - -inline bool IsChineseChar(const wchar_t& ch) { - if ((ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && ch <= 0x4DBF) || - (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) || - (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) || - (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F)) - return true; - return false; -} - -inline bool IsWhiteSpace(const wchar_t& ch) { - if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true; - auto cat = utf8proc_category(ch); - if (cat == UTF8PROC_CATEGORY_ZS) return true; - return false; -} - -inline bool IsPunctuation(const wchar_t& ch) { - if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) || - (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126)) - return true; - auto cat = utf8proc_category(ch); - if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS || - cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC || - cat == UTF8PROC_CATEGORY_PO // sometimes ¶ belong SO - || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF) - return true; - return false; -} - -BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */) - : do_lower_case_(do_lower_case) {} - -wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const { - wchar_t new_ch = utf8proc_tolower(ch); - return new_ch; -} - -void BasicTokenizer::Tokenize(const string& text, vector* res) const { - std::wstring unicode_text; - bool status = framework::ConvertStrToWstr(text, &unicode_text); - if (!status) { - // String is converted into wstring failedly. 
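The classification helpers above (IsControl, IsChineseChar, IsWhiteSpace, IsPunctuation) drive the whole basic-tokenization pass: control characters are dropped, CJK characters and punctuation become single-character tokens, and whitespace flushes the pending token. A compressed standalone version of that loop, using <cwctype> approximations in place of utf8proc and only the main CJK block:

```cpp
#include <cwctype>
#include <string>
#include <vector>

// Only the main CJK Unified Ideographs block; the kernel checks eight ranges.
bool IsCjk(wchar_t ch) { return ch >= 0x4E00 && ch <= 0x9FFF; }

std::vector<std::wstring> BasicTokenize(const std::wstring& text,
                                        bool lower_case = true) {
  std::vector<std::wstring> out;
  std::wstring cache;
  auto flush = [&] {
    if (!cache.empty()) {
      out.push_back(cache);
      cache.clear();
    }
  };
  for (wchar_t ch : text) {
    // Drop NUL, the replacement char, and non-whitespace control chars.
    if (ch == 0 || ch == 0xfffd ||
        (std::iswcntrl(ch) && !std::iswspace(ch))) {
      continue;
    }
    if (lower_case) ch = static_cast<wchar_t>(std::towlower(ch));
    if (IsCjk(ch) || std::iswpunct(ch)) {
      flush();
      out.push_back(std::wstring{ch});  // each CJK char / punct is a token
    } else if (std::iswspace(ch)) {
      flush();                          // whitespace only separates tokens
    } else {
      cache += ch;
    }
  }
  flush();
  return out;
}
```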
- return; - } - std::wstring cache_text = L""; - auto PushCacheText = [&]() { - if (!cache_text.empty()) { - res->emplace_back(cache_text); - cache_text = L""; - } - }; - for (auto& ch : unicode_text) { - if (ch == 0 || ch == 0xfffd || IsControl(ch)) { - continue; - } - if (do_lower_case_) { - ch = do_lower_case(ch); - } - if (IsChineseChar(ch) || IsPunctuation(ch)) { - PushCacheText(); - res->emplace_back(std::wstring{ch}); - } else if (IsWhiteSpace(ch)) { - PushCacheText(); - } else { - cache_text += ch; - } - } - PushCacheText(); -} - -WordPieceTokenizer::WordPieceTokenizer( - const framework::Vocab* vocab, - const wstring& unk_token /* = L"[UNK]"*/, - const size_t max_input_chars_per_word /* = 100 */) - : vocab_(vocab), - unk_token_(unk_token), - max_input_chars_per_word_(max_input_chars_per_word) { - unk_token_id_ = vocab_->at(unk_token_); -} - -void WordPieceTokenizer::Tokenize(const wstring& text, - vector* token_ids) const { - size_t len = text.size(); - if (len > max_input_chars_per_word_) { - token_ids->emplace_back(unk_token_id_); - return; - } - - auto it = vocab_->find(text); - if (it != vocab_->end()) { - token_ids->emplace_back(it->second); - return; - } - - size_t start = 0; - vector wordpiece_ids; - while (start < len) { - size_t end = len; - std::wstring cur_substr; - int64_t cur_substr_id = 0; - while (start < end) { - std::wstring sub = text.substr(start, end - start); - if (start > 0) { - sub.insert(0, L"##"); - } - auto it = vocab_->find(sub); - if (it != vocab_->end()) { - cur_substr = sub; - cur_substr_id = it->second; - break; - } - end -= 1; - } - - if (cur_substr.empty()) { - token_ids->emplace_back(unk_token_id_); - return; - } else { - start = end; - wordpiece_ids.emplace_back(cur_substr_id); - } - } - for (auto& token_id : wordpiece_ids) { - token_ids->emplace_back(token_id); - } -} - -BertTokenizer::BertTokenizer(const framework::Vocab* vocab, - bool do_lower_case /* = false */, - const wstring& unk_token /* = L"[UNK]" */, - const wstring& pad_token /* = L"[PAD]" */, - const wstring& cls_token /* = L"[CLS]" */, - const wstring& mask_token /* = L"[MASK]" */, - const wstring& sep_token /* = L"[SEP]" */, - const string& padding_site /* = "right" */) - : do_lower_case_(do_lower_case), - unk_token_(unk_token), - pad_token_(pad_token), - cls_token_(cls_token), - mask_token_(mask_token), - sep_token_(sep_token), - padding_site_(padding_site), - vocab_(vocab), - basic_tokenizer_(do_lower_case_), - word_piece_tokenizer_(vocab_, unk_token) { - unk_token_id_ = vocab_->at(unk_token_); - pad_token_id_ = vocab_->at(pad_token_); - cls_token_id_ = vocab_->at(cls_token_); - mask_token_id_ = vocab_->at(mask_token_); - sep_token_id_ = vocab_->at(sep_token_); - - all_special_tokens_ = vector( - {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); - all_special_token_ids_ = unordered_set({unk_token_id_, - pad_token_id_, - cls_token_id_, - mask_token_id_, - sep_token_id_}); -} - -void BertTokenizer::Tokenize(const string& text, - vector* split_token_ids) const { - std::vector tmp_tokens; - basic_tokenizer_.Tokenize(text, &tmp_tokens); - if (tmp_tokens.empty()) return; - split_token_ids->reserve(tmp_tokens.size()); - for (auto& w_token : tmp_tokens) { - const auto& vec_size = w_token.size(); - if (vec_size == 1) { - if (IsChineseChar(w_token[0])) { - auto vocab_it = vocab_->find(w_token); - if (vocab_it != vocab_->end()) { - split_token_ids->emplace_back(vocab_it->second); - } else { - split_token_ids->emplace_back(unk_token_id_); - } - } else { - 
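The inner loop above is the classic greedy longest-match-first WordPiece algorithm: try the longest remaining substring, shrink from the right until the vocabulary hits, emit that piece, and resume where it ended, with non-initial pieces prefixed by "##". The same algorithm condensed against a plain std::unordered_map, for reference:

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

std::vector<int64_t> WordPiece(
    const std::wstring& word,
    const std::unordered_map<std::wstring, int64_t>& vocab,
    int64_t unk_id) {
  std::vector<int64_t> ids;
  size_t start = 0;
  while (start < word.size()) {
    size_t end = word.size();
    int64_t match_id = -1;
    while (start < end) {                   // shrink the candidate from the right
      std::wstring sub = word.substr(start, end - start);
      if (start > 0) sub.insert(0, L"##");  // mark continuation pieces
      auto it = vocab.find(sub);
      if (it != vocab.end()) {
        match_id = it->second;
        break;
      }
      --end;
    }
    if (match_id < 0) return {unk_id};      // whole word collapses to [UNK]
    ids.push_back(match_id);
    start = end;
  }
  return ids;
}
```

As in the kernel, one unmatched remainder makes the entire word fall back to the single [UNK] id rather than emitting a partial piece list.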
word_piece_tokenizer_.Tokenize(w_token, split_token_ids); - } - } else if (vec_size > 1) { - word_piece_tokenizer_.Tokenize(w_token, split_token_ids); - } else { - continue; - } - } -} - -void BertTokenizer::BuildInputsWithSpecialTokens( - vector* inputs, - const vector& token_ids_0, - const vector& token_ids_1 /* = vector() */) const { - if (token_ids_1.empty()) { - inputs->clear(); - inputs->resize(token_ids_0.size() + 2); - inputs->at(0) = cls_token_id_; - size_t i = 1; - for (auto& token_id : token_ids_0) { - inputs->at(i) = token_id; - ++i; - } - inputs->at(i) = sep_token_id_; - } else { - inputs->clear(); - inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); - inputs->at(0) = cls_token_id_; - size_t i = 1; - for (auto& token_id : token_ids_0) { - inputs->at(i) = token_id; - ++i; - } - inputs->at(i) = sep_token_id_; - ++i; - for (auto& token_id : token_ids_1) { - inputs->at(i) = token_id; - ++i; - } - inputs->at(i) = sep_token_id_; - } -} - -int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const { - if (pair) { - return 3; - } else { - return 2; - } -} - -void BertTokenizer::CreateTokenTypeIdsFromSequences( - vector* token_type_ids, - const vector& token_ids_0, - const vector& token_ids_1 /* = vector() */) const { - if (token_ids_1.empty()) { - vector tmp(token_ids_0.size() + 2, 0); - token_type_ids->swap(tmp); - } else { - vector tmp(token_ids_0.size() + token_ids_1.size() + 3, 0); - for (size_t i = token_ids_0.size() + 2; i < tmp.size(); i++) { - tmp[i] = 1; - } - token_type_ids->swap(tmp); - } -} - -void BertTokenizer::TruncateSequence( - vector* ids, - vector* pair_ids, - const size_t num_tokens_to_remove /* = 0 */, - const size_t stride /* = 0 */) const { - for (size_t i = 0; i < num_tokens_to_remove; i++) { - if ((pair_ids->empty()) || (ids->size() > pair_ids->size())) { - ids->pop_back(); - } else { - pair_ids->pop_back(); - } - } -} - -int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; } - -int BertTokenizer::Encode( - unordered_map>* encoded_inputs, - const string& text, - const string& text_pair /* = "" */, - bool is_split_into_words /* = false */, - const size_t max_seq_len /* = 0 */, - bool pad_to_max_seq_len /* = false */) const { - vector ids; - vector pair_ids; - if (!is_split_into_words) { - Tokenize(text, &ids); - if (ids.empty()) return 0; - if (!text_pair.empty()) { - Tokenize(text_pair, &pair_ids); - if (pair_ids.empty()) return 0; - } - } else { - std::wstring unicode_text; - bool status_a = framework::ConvertStrToWstr(text, &unicode_text); - if (!status_a) { - return 0; - } - for (size_t i = 0; i < unicode_text.size(); i++) { - wstring token = unicode_text.substr(i, 1); - auto it = vocab_->find(token); - if (it != vocab_->end()) { - ids.emplace_back(it->second); - } else { - ids.emplace_back(unk_token_id_); - } - } - } - - bool pair = false; - if (!pair_ids.empty()) { - pair = true; - } - - size_t len_ids = ids.size(); - size_t len_pair_ids = pair_ids.size(); - - // Truncation: Handle max sequence length - // If max_seq_len == 0, then do nothing and keep the real length. - // If max_seq_len > 0 and - // all the input sequence len is over the max_seq_len, - // then we truncate it. 
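BuildInputsWithSpecialTokens and CreateTokenTypeIdsFromSequences together pin down the BERT input layout: [CLS] A [SEP] with all-zero segment ids for a single sequence, and [CLS] A [SEP] B [SEP] with segment id 1 from the second sequence onward for a pair. A compact illustration; the special-token ids 101/102 are the conventional BERT values, used here only for the demo:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical special-token ids, for illustration only.
constexpr int64_t kCls = 101, kSep = 102;

void BuildPair(const std::vector<int64_t>& a, const std::vector<int64_t>& b,
               std::vector<int64_t>* ids, std::vector<int64_t>* seg) {
  ids->assign({kCls});
  ids->insert(ids->end(), a.begin(), a.end());
  ids->push_back(kSep);
  seg->assign(ids->size(), 0);   // [CLS] A [SEP] -> segment 0
  if (!b.empty()) {
    ids->insert(ids->end(), b.begin(), b.end());
    ids->push_back(kSep);
    seg->resize(ids->size(), 1);  // B [SEP] -> segment 1
  }
}

int main() {
  std::vector<int64_t> ids, seg;
  BuildPair({7, 8}, {9}, &ids, &seg);
  for (auto v : ids) std::cout << v << ' ';  // 101 7 8 102 9 102
  std::cout << '\n';
  for (auto v : seg) std::cout << v << ' ';  // 0 0 0 0 1 1
  std::cout << '\n';
}
```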
- size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair); - if (max_seq_len > 0 && total_len > max_seq_len) { - TruncateSequence(&ids, &pair_ids, total_len - max_seq_len); - } - - // Add special tokens - vector sequence; - BuildInputsWithSpecialTokens(&sequence, ids, pair_ids); - size_t seq_len = sequence.size(); - vector token_type_ids; - CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids); - - // Build output dictionary - encoded_inputs->emplace("input_ids", sequence); - encoded_inputs->emplace("token_type_ids", token_type_ids); - // Check lengths - if (max_seq_len > 0 && seq_len > max_seq_len) { - VLOG(3) << "There is something wrong with the input sequence length." - " Please check it."; - // Failed. - return 0; - } - - // Padding - bool needs_to_be_padded = false; - if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) { - needs_to_be_padded = true; - } - - if (needs_to_be_padded) { - int64_t difference = static_cast(max_seq_len - seq_len); - size_t pad_start = max_seq_len - 1 - difference; - encoded_inputs->at("token_type_ids").resize(max_seq_len); - for (size_t i = max_seq_len - 1; i > pad_start; i--) { - encoded_inputs->at("token_type_ids")[i] = pad_token_id_; - } - - encoded_inputs->at("input_ids").resize(max_seq_len); - for (size_t i = max_seq_len - 1; i > pad_start; i--) { - encoded_inputs->at("input_ids")[i] = pad_token_id_; - } - } - return 1; -} - -void BertTokenizer::BatchEncode( - vector>>* batch_encode_inputs, - const framework::Strings& batch_text, - const framework::Strings& batch_text_pair /* = vector() */, - bool is_split_into_words /* = false */, - const size_t max_seq_len /* = 0 */, - bool pad_to_max_seq_len /* = false */) const { - bool has_text_pair = false; - if (batch_text_pair.size() != 0) { - has_text_pair = true; - } - - size_t batch_size = batch_text.size(); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (size_t i = 0; i < batch_size; i++) { - unordered_map> res; - if (has_text_pair) { - auto status = Encode(&res, - batch_text[i], - batch_text_pair[i], - is_split_into_words, - max_seq_len, - pad_to_max_seq_len); - if (!status) { - res["input_ids"] = - std::vector{cls_token_id_, sep_token_id_, cls_token_id_}; - res["token_type_ids"] = std::vector{0, 0, 1}; - } - } else { - auto status = Encode(&res, - batch_text[i], - {}, - is_split_into_words, - max_seq_len, - pad_to_max_seq_len); - - if (!status) { - res["input_ids"] = std::vector{cls_token_id_, sep_token_id_}; - res["token_type_ids"] = std::vector{0, 0}; - } - } - batch_encode_inputs->at(i) = std::move(res); - } -} - class FasterTokenizerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -532,6 +114,3 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp, ops::FasterTokenizerOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - faster_tokenizer, CPU, ALL_LAYOUT, ops::FasterTokenizerKernel, int64_t) {} diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h deleted file mode 100644 index 1f848cb393fae..0000000000000 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ /dev/null @@ -1,210 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/string_array.h" - -namespace paddle { -namespace operators { - -using std::endl; -using std::int64_t; -using std::shared_ptr; -using std::size_t; -using std::string; -using std::unordered_map; -using std::unordered_set; -using std::vector; -using std::wcout; -using std::wstring; - -inline bool IsControl(const wchar_t& ch); -inline bool IsChineseChar(const wchar_t& ch); -inline bool IsWhiteSpace(const wchar_t& ch); - -using Vocab = unordered_map; -using InvVocab = unordered_map; - -class BasicTokenizer { - public: - explicit BasicTokenizer(bool do_lower_case = true); - void Tokenize(const string& text, vector* res) const; - - private: - wchar_t do_lower_case(wchar_t ch) const; - - bool do_lower_case_; -}; - -class WordPieceTokenizer { - public: - explicit WordPieceTokenizer(const framework::Vocab* vocab, - const wstring& unk_token = L"[UNK]", - const size_t max_input_chars_per_word = 100); - void Tokenize(const wstring& text, vector* output) const; - - private: - const framework::Vocab* vocab_; - wstring unk_token_{L"[UNK]"}; - int64_t unk_token_id_; - size_t max_input_chars_per_word_; -}; - -class BertTokenizer { - public: - explicit BertTokenizer(const framework::Vocab* vocab, - bool do_lower_case = false, - const wstring& unk_token = L"[UNK]", - const wstring& pad_token = L"[PAD]", - const wstring& cls_token = L"[CLS]", - const wstring& mask_token = L"[MASK]", - const wstring& sep_token = L"[SEP]", - const string& padding_site = "right"); - - void Tokenize(const string& text, vector* split_tokens) const; - void BuildInputsWithSpecialTokens( - vector* res, - const vector& token_ids_0, - const vector& token_ids_1 = vector()) const; - void CreateTokenTypeIdsFromSequences( - vector* token_type_ids, - const vector& token_ids_0, - const vector& token_ids_1 = vector()) const; - void TruncateSequence(vector* ids, - vector* pair_ids, - const size_t num_tokens_to_remove = 0, - const size_t stride = 0) const; - int64_t GetNumSpecialTokensToAdd(const bool pair = false) const; - int Encode(unordered_map>* encoded_inputs, - const string& text, - const string& text_pair = "", - bool is_split_into_words = false, - const size_t max_seq_len = 0, - bool pad_to_max_seq_len = false) const; - void BatchEncode( - vector>>* batch_encode_inputs, - const framework::Strings& batch_text, - const framework::Strings& batch_text_pair = framework::Strings(), - bool is_split_into_words = false, - const size_t max_seq_len = 0, - bool pad_to_max_seq_len = false) const; - - int64_t GetPadTokenID() const; - - private: - bool do_lower_case_; - wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; - string padding_site_; - const framework::Vocab* vocab_; - BasicTokenizer basic_tokenizer_; - WordPieceTokenizer word_piece_tokenizer_; - int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, - sep_token_id_; - vector all_special_tokens_; - unordered_set all_special_token_ids_; - InvVocab inv_vocab_; -}; - -template -class FasterTokenizerKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* text = ctx.Input("Text"); - auto* vocab = ctx.Input("Vocab"); - - auto* input_ids = ctx.Output("InputIds"); - auto* seg_ids = ctx.Output("SegmentIds"); - - auto do_lower_case = static_cast(ctx.Attr("do_lower_case")); - auto is_split_into_words = - static_cast(ctx.Attr("is_split_into_words")); - auto max_seq_len = static_cast(ctx.Attr("max_seq_len")); - auto pad_to_max_seq_len = - static_cast(ctx.Attr("pad_to_max_seq_len")); - - auto* text_pair = ctx.Input("TextPair"); - if (text_pair && text->size() != text_pair->size()) { - VLOG(3) << "The input text(list[str]) and text pair (list[str]) must" - << "be the same number of text sequence. Please check the input!"; - return; - } - - BertTokenizer tokenizer(vocab, do_lower_case); - size_t batch_max_seq_len = 0; - size_t batch_size = text->size(); - - vector>> batch_encode_inputs( - batch_size); - if (text_pair) { - tokenizer.BatchEncode(&batch_encode_inputs, - *text, - *text_pair, - is_split_into_words, - max_seq_len, - pad_to_max_seq_len); - } else { - tokenizer.BatchEncode(&batch_encode_inputs, - *text, - framework::Strings(), - is_split_into_words, - max_seq_len, - pad_to_max_seq_len); - } - - for (size_t i = 0; i < batch_size; ++i) { - size_t seq_len = batch_encode_inputs[i]["input_ids"].size(); - if (seq_len > batch_max_seq_len) { - batch_max_seq_len = seq_len; - } - } - - input_ids->Resize( - common::make_ddim({static_cast(batch_size), - static_cast(batch_max_seq_len)})); - auto* input_ids_data = input_ids->mutable_data(ctx.GetPlace()); - seg_ids->Resize( - common::make_ddim({static_cast(batch_size), - static_cast(batch_max_seq_len)})); - auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); - - auto pad_token_id = tokenizer.GetPadTokenID(); - for (size_t i = 0; i < batch_size; i++) { - auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; - auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; - const size_t& seq_len = encoder_input_ids.size(); - // Copy the memory - std::memcpy(input_ids_data + i * batch_max_seq_len, - encoder_input_ids.data(), - seq_len * sizeof(T)); - std::memcpy(seg_ids_data + i * batch_max_seq_len, - encoder_seg_ids.data(), - seq_len * sizeof(T)); - std::memset(input_ids_data + i * batch_max_seq_len + seq_len, - pad_token_id, - (batch_max_seq_len - seq_len) * sizeof(T)); - std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, - pad_token_id, - (batch_max_seq_len - seq_len) * sizeof(T)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 3e33a77002957..a759d7f4ac1e2 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -176,6 +176,7 @@ 'dgc', 'dpsgd', 'embedding_grad_sparse', + 'faster_tokenizer', 'ftrl', 'fused_adam_', 'fused_batch_norm_act_', diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 5e8ff5d5fc2ef..39045a10ff3c1 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(distributed) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +add_subdirectory(vocab) if(WITH_GPU) proto_library(external_error_proto SRCS external_error.proto) diff --git a/paddle/phi/core/kernel_registry.cc b/paddle/phi/core/kernel_registry.cc index 
172ad23e9302f..0cc66aafd7be9 100644 --- a/paddle/phi/core/kernel_registry.cc +++ b/paddle/phi/core/kernel_registry.cc @@ -19,6 +19,7 @@ #include "paddle/phi/core/custom_kernel.h" #include "paddle/phi/core/kernel_utils.h" +#include "paddle/phi/core/vocab/string_array.h" namespace phi { @@ -88,6 +89,13 @@ void SetKernelArgsDef(const std::vector& args_type, default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == + std::type_index(typeid( + const paddle::optional&))) { // NOLINT + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid( const std::vector&))) { // NOLINT @@ -95,6 +103,12 @@ void SetKernelArgsDef(const std::vector& args_type, default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid( + const paddle::optional&))) { // NOLINT + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid( const std::vector&))) { // NOLINT diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 801a69498b4c9..d6fdc7cb80a4a 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -27,6 +27,7 @@ #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/tensor_array.h" +#include "paddle/phi/core/vocab/string_array.h" namespace phi { @@ -319,6 +320,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(ExtendedTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(ExtendedTensor); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(ExtendedTensor); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(TensorBase); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SelectedRows); @@ -340,6 +342,9 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(TensorArray); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(TensorArray); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(phi::Strings); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(phi::Strings); + /* Attribute Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index b419338401eea..fe9878d685412 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -26,6 +26,7 @@ limitations under the License. 
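The new PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT lines are what allow a phi kernel to declare Strings and Vocab parameters directly, including optional ones. Conceptually the registry unpacks arguments into an entry point shaped like the sketch below, where std::optional is a simplified stand-in for paddle::optional:

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <vector>

using Strings = std::vector<std::string>;

// The kernel shape enabled by the new specializations: one required
// Strings input plus an optional second one.
void TokenizerLikeKernel(const Strings& text,
                         const std::optional<Strings>& text_pair) {
  std::cout << "text: " << text.size() << " item(s), pair: "
            << (text_pair ? text_pair->size() : 0) << " item(s)\n";
}

int main() {
  TokenizerLikeKernel({"hello world"}, std::nullopt);
  TokenizerLikeKernel({"hello"}, Strings{"world"});
}
```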
*/ #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/tensor_array.h" #include "paddle/phi/core/utils/type_info.h" +#include "paddle/phi/core/vocab/string_array.h" namespace phi { @@ -50,6 +51,8 @@ template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; +template class TypeInfoTraits; +template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; diff --git a/paddle/phi/core/vocab/CMakeLists.txt b/paddle/phi/core/vocab/CMakeLists.txt new file mode 100644 index 0000000000000..d0b065227d154 --- /dev/null +++ b/paddle/phi/core/vocab/CMakeLists.txt @@ -0,0 +1 @@ +collect_srcs(core_srcs SRCS string_array.cc) diff --git a/paddle/fluid/framework/phi_tensor_base_vector.h b/paddle/phi/core/vocab/phi_tensor_base_vector.h similarity index 92% rename from paddle/fluid/framework/phi_tensor_base_vector.h rename to paddle/phi/core/vocab/phi_tensor_base_vector.h index 1d775383de809..f2389ba482682 100644 --- a/paddle/fluid/framework/phi_tensor_base_vector.h +++ b/paddle/phi/core/vocab/phi_tensor_base_vector.h @@ -1,4 +1,4 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,8 +20,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/extended_tensor.h" -namespace paddle { -namespace framework { +namespace phi { template struct PhiVectorType; @@ -97,5 +96,14 @@ class PhiVector : public phi::ExtendedTensor, std::vector data_; }; +} // namespace phi + +namespace paddle { +namespace framework { +template +using PhiVector = phi::PhiVector; + +template +using PhiVectorType = phi::PhiVectorType; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/string_array.cc b/paddle/phi/core/vocab/string_array.cc similarity index 78% rename from paddle/fluid/framework/string_array.cc rename to paddle/phi/core/vocab/string_array.cc index 96aa8d04988aa..4a9b8df9439fc 100644 --- a/paddle/fluid/framework/string_array.cc +++ b/paddle/phi/core/vocab/string_array.cc @@ -1,26 +1,23 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/string_array.h" - +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
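phi_tensor_base_vector.h stays generic after the move: a payload becomes kernel-visible by specializing PhiVectorType<T> with a unique type name, which is exactly how the Strings and VariableRefArray specializations elsewhere in this patch work. The bare pattern, stripped of the phi::ExtendedTensor and TypeInfoTraits plumbing:

```cpp
#include <iostream>
#include <string>
#include <vector>

template <typename T>
struct PhiVectorType;  // primary template intentionally left undefined

// Opting a payload in mirrors PhiVectorType<std::string> in this patch.
template <>
struct PhiVectorType<int> {
  const char* type_name = "PhiVectorInt";
};

// Reduced PhiVector: a named, vector-backed container.
template <typename T>
class PhiVector {
 public:
  static const char* name() { return PhiVectorType<T>().type_name; }
  std::vector<T>& data() { return data_; }

 private:
  std::vector<T> data_;
};

int main() {
  PhiVector<int> v;
  v.data().push_back(42);
  std::cout << PhiVector<int>::name() << " holds " << v.data().size()
            << " item\n";
}
```

An unspecialized T fails to compile at the PhiVectorType<T>() instantiation, so every payload is forced to register a readable type name.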
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/vocab/string_array.h" #include - #include - #include "glog/logging.h" -namespace paddle::framework { +namespace phi { std::wstring_convert> kConverter; @@ -100,4 +97,4 @@ void StringMapFromStream(std::istream& is, } } -} // namespace paddle::framework +} // namespace phi diff --git a/paddle/phi/core/vocab/string_array.h b/paddle/phi/core/vocab/string_array.h new file mode 100644 index 0000000000000..73cdcfd793470 --- /dev/null +++ b/paddle/phi/core/vocab/string_array.h @@ -0,0 +1,142 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/extended_tensor.h" +#include "paddle/phi/core/vocab/phi_tensor_base_vector.h" + +namespace phi { +template <> +struct PhiVectorType { + const char* type_name = "PhiVectorString"; +}; + +// Note(YuanRisheng): Vocab is mainly used for faster_tokenizer_op and we don't +// recommend widely use it. Because faster_tokenizer_op may be deleted in the +// future and this class will be deleted. + +class Vocab : public phi::ExtendedTensor, + public phi::TypeInfoTraits { + public: + Vocab() = default; + + Vocab(Vocab&& other) = default; + + Vocab(const Vocab& other) = default; + + Vocab& operator=(const Vocab& other) = default; + + Vocab& operator=(Vocab&& other) = default; + + Vocab& operator=( + const std::unordered_map& other) { + this->data_ = other; + return *this; + } + + /// \brief Destroy the Vocab and release exclusive resources. + virtual ~Vocab() = default; + + public: + /// \brief Returns the name of the class for type traits. + /// \return The name of the class. + static const char* name() { return "Vocab"; } + + size_t size() const { return data_.size(); } + + void clear() { data_.clear(); } + + void emplace(const std::wstring& key, std::int32_t value) { + data_.emplace(key, value); + } + + std::int32_t at(const std::wstring& key) { return data_.at(key); } + + std::int32_t at(const std::wstring& key) const { return data_.at(key); } + + std::unordered_map::iterator find( + const std::wstring& key) { + return data_.find(key); + } + + std::unordered_map::const_iterator find( + const std::wstring& key) const { + return data_.find(key); + } + + std::unordered_map::iterator begin() { + return data_.begin(); + } + + std::unordered_map::const_iterator begin() const { + return data_.begin(); + } + + std::unordered_map::iterator end() { + return data_.end(); + } + + std::unordered_map::const_iterator end() const { + return data_.end(); + } + + private: + std::unordered_map data_; +}; + +// Note(YuanRisheng): PhiVector is essentially a vector that only used for PHI +// Kernel. It can be used when you define a non-tensor type that needs to be +// stored in a vector as PHI kernel argument. 
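Vocab is deliberately a thin facade over std::unordered_map<std::wstring, std::int32_t>; inheriting from ExtendedTensor is what lets it travel through Variable and the kernel registry untouched. A standalone sketch of the surface it exposes, with the phi base classes omitted:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in exposing the same map-like surface as phi::Vocab.
class Vocab {
 public:
  void emplace(const std::wstring& key, std::int32_t value) {
    data_.emplace(key, value);
  }
  std::int32_t at(const std::wstring& key) const { return data_.at(key); }
  size_t size() const { return data_.size(); }

 private:
  std::unordered_map<std::wstring, std::int32_t> data_;
};

int main() {
  Vocab vocab;
  vocab.emplace(L"[UNK]", 0);
  vocab.emplace(L"[PAD]", 1);
  std::cout << "unk id = " << vocab.at(L"[UNK]")
            << ", size = " << vocab.size() << "\n";
}
```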
+ +using String = std::string; +using Strings = PhiVector; + +// Convert the std::string type to the std::string type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res); +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res); +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret); + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data); + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data); +} // namespace phi + +namespace paddle { +namespace framework { +using Vocab = phi::Vocab; +using Strings = phi::Strings; +using String = phi::String; +using phi::ConvertStrToWstr; +using phi::ConvertWstrToStr; +using phi::NFD; +using phi::StringMapFromStream; +using phi::StringMapToStream; +} // namespace framework +} // namespace paddle diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 5925634f8d87c..c9282b43d4e5e 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -742,6 +742,22 @@ void InstanceNormInferMeta(const MetaTensor& x, } } +void FasterTokenizerInferMeta(const MetaTensor& vocab, + const MetaTensor& text, + const MetaTensor& text_pair, + bool do_lower_case, + bool is_split_into_words, + int max_seq_len, + bool pad_to_max_seq_len, + MetaTensor* input_ids, + MetaTensor* segment_ids, + MetaConfig config) { + input_ids->set_dims({-1, -1}); + segment_ids->set_dims({-1, -1}); + input_ids->set_dtype(phi::DataType::INT64); + segment_ids->set_dtype(phi::DataType::INT64); +} + void GlobalGatherInferMeta(const MetaTensor& x, const MetaTensor& local_count, const MetaTensor& global_count, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index f5f6307a8fa0d..a86e06239d518 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -152,6 +152,17 @@ void InstanceNormInferMeta(const MetaTensor& x, MetaTensor* saved_variance, MetaConfig config = MetaConfig()); +void FasterTokenizerInferMeta(const MetaTensor& vocab, + const MetaTensor& text, + const MetaTensor& text_pair, + bool do_lower_case, + bool is_split_into_words, + int max_seq_len, + bool pad_to_max_seq_len, + MetaTensor* input_ids, + MetaTensor* segment_ids, + MetaConfig config = MetaConfig()); + void GlobalGatherInferMeta(const MetaTensor& x, const MetaTensor& local_count, const MetaTensor& global_count, diff --git a/paddle/phi/kernels/cpu/faster_tokenizer_kernel.cc b/paddle/phi/kernels/cpu/faster_tokenizer_kernel.cc new file mode 100644 index 0000000000000..e27db0c181ac6 --- /dev/null +++ b/paddle/phi/kernels/cpu/faster_tokenizer_kernel.cc @@ -0,0 +1,617 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
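The conversion helpers declared above are thin wrappers over a std::wstring_convert instance (see the kConverter object in string_array.cc earlier in this patch). A sketch of the round trip they provide, using the same machinery; note std::wstring_convert is deprecated since C++17 but remains what the implementation relies on, and the exact error handling here is an assumption:

```cpp
#include <codecvt>
#include <iostream>
#include <locale>
#include <string>

// Mirrors the kConverter object in string_array.cc.
static std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;

bool ConvertStrToWstr(const std::string& src, std::wstring* res) {
  try {
    *res = converter.from_bytes(src);  // UTF-8 -> wide
  } catch (const std::range_error&) {
    return false;                      // invalid byte sequence
  }
  return true;
}

int main() {
  std::wstring w;
  if (ConvertStrToWstr("tokenizer", &w)) {
    std::cout << converter.to_bytes(w) << "\n";  // wide -> UTF-8 round trip
  }
}
```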
+ +#include + +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/vocab/string_array.h" + +namespace phi { + +using std::endl; +using std::ifstream; +using std::int64_t; +using std::shared_ptr; +using std::size_t; +using std::string; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using std::wcout; +using std::wstring; +using Strings = paddle::framework::Strings; + +inline bool IsControl(const wchar_t& ch); +inline bool IsChineseChar(const wchar_t& ch); +inline bool IsWhiteSpace(const wchar_t& ch); + +using InvVocab = unordered_map; + +class BasicTokenizer { + public: + explicit BasicTokenizer(bool do_lower_case = true); + void Tokenize(const string& text, vector* res) const; + + private: + wchar_t do_lower_case(wchar_t ch) const; + + bool do_lower_case_; +}; + +class WordPieceTokenizer { + public: + explicit WordPieceTokenizer(const paddle::framework::Vocab* vocab, + const wstring& unk_token = L"[UNK]", + const size_t max_input_chars_per_word = 100); + void Tokenize(const wstring& text, vector* output) const; + + private: + const paddle::framework::Vocab* vocab_; + wstring unk_token_{L"[UNK]"}; + int64_t unk_token_id_; + size_t max_input_chars_per_word_; +}; + +class BertTokenizer { + public: + explicit BertTokenizer(const paddle::framework::Vocab* vocab, + bool do_lower_case = false, + const wstring& unk_token = L"[UNK]", + const wstring& pad_token = L"[PAD]", + const wstring& cls_token = L"[CLS]", + const wstring& mask_token = L"[MASK]", + const wstring& sep_token = L"[SEP]", + const string& padding_site = "right"); + + void Tokenize(const string& text, vector* split_tokens) const; + void BuildInputsWithSpecialTokens( + vector* res, + const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void CreateTokenTypeIdsFromSequences( + vector* token_type_ids, + const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void TruncateSequence(vector* ids, + vector* pair_ids, + const size_t num_tokens_to_remove = 0, + const size_t stride = 0) const; + int64_t GetNumSpecialTokensToAdd(const bool pair = false) const; + int Encode(unordered_map>* encoded_inputs, + const string& text, + const string& text_pair = "", + bool is_split_into_words = false, + const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + void BatchEncode( + vector>>* batch_encode_inputs, + const Strings& batch_text, + const Strings& batch_text_pair = Strings(), + bool is_split_into_words = false, + const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + + int64_t GetPadTokenID() const; + + private: + bool do_lower_case_; + wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; + string padding_site_; + const paddle::framework::Vocab* vocab_; + BasicTokenizer basic_tokenizer_; + WordPieceTokenizer word_piece_tokenizer_; + int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, + sep_token_id_; + vector all_special_tokens_; + unordered_set all_special_token_ids_; + InvVocab inv_vocab_; +}; + +const wstring kStripChars = L" \t\n\r\v\f"; + +inline bool IsControl(const wchar_t& ch) { + if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true; + return false; +} + +inline bool IsChineseChar(const wchar_t& ch) { + if ((ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && 
ch <= 0x4DBF) || + (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) || + (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) || + (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F)) + return true; + return false; +} + +inline bool IsWhiteSpace(const wchar_t& ch) { + if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_ZS) return true; + return false; +} + +inline bool IsPunctuation(const wchar_t& ch) { + if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) || + (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126)) + return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS || + cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC || + cat == UTF8PROC_CATEGORY_PO // sometimes ¶ belong SO + || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF) + return true; + return false; +} + +BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */) + : do_lower_case_(do_lower_case) {} + +wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const { + wchar_t new_ch = utf8proc_tolower(ch); + return new_ch; +} + +void BasicTokenizer::Tokenize(const string& text, vector* res) const { + std::wstring unicode_text; + bool status = phi::ConvertStrToWstr(text, &unicode_text); + if (!status) { + // String is converted into wstring failedly. + return; + } + std::wstring cache_text = L""; + auto PushCacheText = [&]() { + if (!cache_text.empty()) { + res->emplace_back(cache_text); + cache_text = L""; + } + }; + for (auto& ch : unicode_text) { + if (ch == 0 || ch == 0xfffd || IsControl(ch)) { + continue; + } + if (do_lower_case_) { + ch = do_lower_case(ch); + } + if (IsChineseChar(ch) || IsPunctuation(ch)) { + PushCacheText(); + res->emplace_back(std::wstring{ch}); + } else if (IsWhiteSpace(ch)) { + PushCacheText(); + } else { + cache_text += ch; + } + } + PushCacheText(); +} + +WordPieceTokenizer::WordPieceTokenizer( + const paddle::framework::Vocab* vocab, + const wstring& unk_token /* = L"[UNK]"*/, + const size_t max_input_chars_per_word /* = 100 */) + : vocab_(vocab), + unk_token_(unk_token), + max_input_chars_per_word_(max_input_chars_per_word) { + unk_token_id_ = vocab_->at(unk_token_); +} + +void WordPieceTokenizer::Tokenize(const wstring& text, + vector* token_ids) const { + size_t len = text.size(); + if (len > max_input_chars_per_word_) { + token_ids->emplace_back(unk_token_id_); + return; + } + + auto it = vocab_->find(text); + if (it != vocab_->end()) { + token_ids->emplace_back(it->second); + return; + } + + size_t start = 0; + vector wordpiece_ids; + while (start < len) { + size_t end = len; + std::wstring cur_substr; + int64_t cur_substr_id = 0; + while (start < end) { + std::wstring sub = text.substr(start, end - start); + if (start > 0) { + sub.insert(0, L"##"); + } + auto it = vocab_->find(sub); + if (it != vocab_->end()) { + cur_substr = sub; + cur_substr_id = it->second; + break; + } + end -= 1; + } + + if (cur_substr.empty()) { + token_ids->emplace_back(unk_token_id_); + return; + } else { + start = end; + wordpiece_ids.emplace_back(cur_substr_id); + } + } + for (auto& token_id : wordpiece_ids) { + token_ids->emplace_back(token_id); + } +} + +BertTokenizer::BertTokenizer(const paddle::framework::Vocab* vocab, + bool do_lower_case /* = false */, + const wstring& unk_token /* = L"[UNK]" */, + const wstring& pad_token /* = L"[PAD]" */, + const wstring& cls_token /* = L"[CLS]" */, + const 
wstring& mask_token /* = L"[MASK]" */, + const wstring& sep_token /* = L"[SEP]" */, + const string& padding_site /* = "right" */) + : do_lower_case_(do_lower_case), + unk_token_(unk_token), + pad_token_(pad_token), + cls_token_(cls_token), + mask_token_(mask_token), + sep_token_(sep_token), + padding_site_(padding_site), + vocab_(vocab), + basic_tokenizer_(do_lower_case_), + word_piece_tokenizer_(vocab_, unk_token) { + unk_token_id_ = vocab_->at(unk_token_); + pad_token_id_ = vocab_->at(pad_token_); + cls_token_id_ = vocab_->at(cls_token_); + mask_token_id_ = vocab_->at(mask_token_); + sep_token_id_ = vocab_->at(sep_token_); + + all_special_tokens_ = vector( + {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); + all_special_token_ids_ = unordered_set({unk_token_id_, + pad_token_id_, + cls_token_id_, + mask_token_id_, + sep_token_id_}); +} + +void BertTokenizer::Tokenize(const string& text, + vector* split_token_ids) const { + std::vector tmp_tokens; + basic_tokenizer_.Tokenize(text, &tmp_tokens); + if (tmp_tokens.empty()) return; + split_token_ids->reserve(tmp_tokens.size()); + for (auto& w_token : tmp_tokens) { + const auto& vec_size = w_token.size(); + if (vec_size == 1) { + if (IsChineseChar(w_token[0])) { + auto vocab_it = vocab_->find(w_token); + if (vocab_it != vocab_->end()) { + split_token_ids->emplace_back(vocab_it->second); + } else { + split_token_ids->emplace_back(unk_token_id_); + } + } else { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } + } else if (vec_size > 1) { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } else { + continue; + } + } +} + +void BertTokenizer::BuildInputsWithSpecialTokens( + vector* inputs, + const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.empty()) { + inputs->clear(); + inputs->resize(token_ids_0.size() + 2); + inputs->at(0) = cls_token_id_; + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = token_id; + ++i; + } + inputs->at(i) = sep_token_id_; + } else { + inputs->clear(); + inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); + inputs->at(0) = cls_token_id_; + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = token_id; + ++i; + } + inputs->at(i) = sep_token_id_; + ++i; + for (auto& token_id : token_ids_1) { + inputs->at(i) = token_id; + ++i; + } + inputs->at(i) = sep_token_id_; + } +} + +int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const { + if (pair) { + return 3; + } else { + return 2; + } +} + +void BertTokenizer::CreateTokenTypeIdsFromSequences( + vector* token_type_ids, + const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.empty()) { + vector tmp(token_ids_0.size() + 2, 0); + token_type_ids->swap(tmp); + } else { + vector tmp(token_ids_0.size() + token_ids_1.size() + 3, 0); + for (size_t i = token_ids_0.size() + 2; i < tmp.size(); i++) { + tmp[i] = 1; + } + token_type_ids->swap(tmp); + } +} + +void BertTokenizer::TruncateSequence( + vector* ids, + vector* pair_ids, + const size_t num_tokens_to_remove /* = 0 */, + const size_t stride /* = 0 */) const { + for (size_t i = 0; i < num_tokens_to_remove; i++) { + if ((pair_ids->empty()) || (ids->size() > pair_ids->size())) { + ids->pop_back(); + } else { + pair_ids->pop_back(); + } + } +} + +int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; } + +int BertTokenizer::Encode( + unordered_map>* encoded_inputs, + const string& text, + const string& text_pair /* = "" */, + bool 
is_split_into_words /* = false */, + const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + vector ids; + vector pair_ids; + if (!is_split_into_words) { + Tokenize(text, &ids); + if (ids.empty()) return 0; + if (!text_pair.empty()) { + Tokenize(text_pair, &pair_ids); + if (pair_ids.empty()) return 0; + } + } else { + std::wstring unicode_text; + bool status_a = phi::ConvertStrToWstr(text, &unicode_text); + if (!status_a) { + return 0; + } + for (size_t i = 0; i < unicode_text.size(); i++) { + wstring token = unicode_text.substr(i, 1); + auto it = vocab_->find(token); + if (it != vocab_->end()) { + ids.emplace_back(it->second); + } else { + ids.emplace_back(unk_token_id_); + } + } + } + + bool pair = false; + if (!pair_ids.empty()) { + pair = true; + } + + size_t len_ids = ids.size(); + size_t len_pair_ids = pair_ids.size(); + + // Truncation: Handle max sequence length + // If max_seq_len == 0, then do nothing and keep the real length. + // If max_seq_len > 0 and + // all the input sequence len is over the max_seq_len, + // then we truncate it. + size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair); + if (max_seq_len > 0 && total_len > max_seq_len) { + TruncateSequence(&ids, &pair_ids, total_len - max_seq_len); + } + + // Add special tokens + vector sequence; + BuildInputsWithSpecialTokens(&sequence, ids, pair_ids); + size_t seq_len = sequence.size(); + vector token_type_ids; + CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids); + + // Build output dictionary + encoded_inputs->emplace("input_ids", sequence); + encoded_inputs->emplace("token_type_ids", token_type_ids); + // Check lengths + if (max_seq_len > 0 && seq_len > max_seq_len) { + VLOG(3) << "There is something wrong with the input sequence length." + " Please check it."; + // Failed. 
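The padding branch just below right-pads both output vectors out to max_seq_len: difference is the number of pad slots, pad_start works out to seq_len - 1, and the backward loops therefore touch exactly positions seq_len through max_seq_len - 1. A plainer equivalent of the same operation:

```cpp
#include <cstdint>
#include <vector>

// Equivalent of the resize-plus-backward-loop in Encode: right-pad to
// max_seq_len with the pad token id.
void PadRight(std::vector<int64_t>* ids, size_t max_seq_len, int64_t pad_id) {
  if (ids->size() < max_seq_len) ids->resize(max_seq_len, pad_id);
}
```

Only the tail positions past seq_len are ever written, so a resize with a fill value reproduces the kernel's loop exactly for both input_ids and token_type_ids.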
+ return 0; + } + + // Padding + bool needs_to_be_padded = false; + if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) { + needs_to_be_padded = true; + } + + if (needs_to_be_padded) { + int64_t difference = static_cast(max_seq_len - seq_len); + size_t pad_start = max_seq_len - 1 - difference; + encoded_inputs->at("token_type_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("token_type_ids")[i] = pad_token_id_; + } + + encoded_inputs->at("input_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("input_ids")[i] = pad_token_id_; + } + } + return 1; +} + +void BertTokenizer::BatchEncode( + vector>>* batch_encode_inputs, + const Strings& batch_text, + const Strings& batch_text_pair /* = vector() */, + bool is_split_into_words /* = false */, + const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + bool has_text_pair = false; + if (batch_text_pair.size() != 0) { + has_text_pair = true; + } + + size_t batch_size = batch_text.size(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < batch_size; i++) { + unordered_map> res; + if (has_text_pair) { + auto status = Encode(&res, + batch_text[i], + batch_text_pair[i], + is_split_into_words, + max_seq_len, + pad_to_max_seq_len); + if (!status) { + res["input_ids"] = + std::vector{cls_token_id_, sep_token_id_, cls_token_id_}; + res["token_type_ids"] = std::vector{0, 0, 1}; + } + } else { + auto status = Encode(&res, + batch_text[i], + {}, + is_split_into_words, + max_seq_len, + pad_to_max_seq_len); + + if (!status) { + res["input_ids"] = std::vector{cls_token_id_, sep_token_id_}; + res["token_type_ids"] = std::vector{0, 0}; + } + } + batch_encode_inputs->at(i) = std::move(res); + } +} + +template +void FasterTokenizerKernel(const Context& dev_ctx, + const phi::ExtendedTensor& vocab_in, + const phi::ExtendedTensor& text_in, + const paddle::optional& text_pair_in, + bool do_lower_case, + bool is_split_into_words, + int max_seq_len, + bool pad_to_max_seq_len, + DenseTensor* input_ids, + DenseTensor* segment_ids) { + const auto* vocab = + reinterpret_cast(&vocab_in); + const auto* text = reinterpret_cast(&text_in); + const auto* text_pair = + reinterpret_cast(text_pair_in.get_ptr()); + auto* seg_ids = segment_ids; + if (text_pair && text->size() != text_pair->size()) { + VLOG(3) << "The input text(list[str]) and text pair (list[str]) must" + << "be the same number of text sequence. 
Please check the input!"; + return; + } + + BertTokenizer tokenizer(vocab, do_lower_case); + size_t batch_max_seq_len = 0; + size_t batch_size = text->size(); + + vector>> batch_encode_inputs( + batch_size); + if (text_pair) { + tokenizer.BatchEncode(&batch_encode_inputs, + *text, + *text_pair, + is_split_into_words, + max_seq_len, + pad_to_max_seq_len); + } else { + tokenizer.BatchEncode(&batch_encode_inputs, + *text, + Strings(), + is_split_into_words, + max_seq_len, + pad_to_max_seq_len); + } + + for (size_t i = 0; i < batch_size; ++i) { + size_t seq_len = batch_encode_inputs[i]["input_ids"].size(); + if (seq_len > batch_max_seq_len) { + batch_max_seq_len = seq_len; + } + } + + input_ids->Resize( + common::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* input_ids_data = dev_ctx.template Alloc(input_ids); + seg_ids->Resize(common::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* seg_ids_data = dev_ctx.template Alloc(seg_ids); + + auto pad_token_id = tokenizer.GetPadTokenID(); + for (size_t i = 0; i < batch_size; i++) { + auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; + auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; + const size_t& seq_len = encoder_input_ids.size(); + // Copy the memory + std::memcpy(input_ids_data + i * batch_max_seq_len, + encoder_input_ids.data(), + seq_len * sizeof(T)); + std::memcpy(seg_ids_data + i * batch_max_seq_len, + encoder_seg_ids.data(), + seq_len * sizeof(T)); + std::memset(input_ids_data + i * batch_max_seq_len + seq_len, + pad_token_id, + (batch_max_seq_len - seq_len) * sizeof(T)); + std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, + pad_token_id, + (batch_max_seq_len - seq_len) * sizeof(T)); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + faster_tokenizer, CPU, ALL_LAYOUT, phi::FasterTokenizerKernel, int64_t) {} diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index 259399b78ebce..2d6fdb3b44d0f 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -1029,6 +1029,17 @@ data_type : logits backward: c_softmax_with_cross_entropy_grad +- op: faster_tokenizer + args: (Tensor vocab, Tensor text, Tensor text_pair, bool do_lower_case = false, + bool is_split_into_words = false, int max_seq_len = 0, bool pad_to_max_seq_len + = false) + output: Tensor (input_ids), Tensor (segment_ids) + infer_meta: + func: FasterTokenizerInferMeta + kernel: + func: faster_tokenizer + optional: text_pair + - op: fused_attention args: (Tensor x, Tensor ln_scale, Tensor ln_bias, Tensor qkv_weight, Tensor qkv_bias, Tensor cache_kv, Tensor src_mask, Tensor out_linear_weight, Tensor out_linear_bias, Tensor ln_scale_2, Tensor ln_bias_2, int num_heads, bool transpose_qkv_wb, bool pre_layer_norm, float epsilon, float attn_dropout_rate, bool is_test, bool attn_dropout_fix_seed, int attn_dropout_seed, str attn_dropout_implementation, float dropout_rate, bool dropout_fix_seed, int dropout_seed, str dropout_implementation, float ln_epsilon, bool add_residual, int ring_id) output: Tensor(ln_mean), Tensor(ln_var), Tensor(ln_out), Tensor(qkv_out), Tensor(qkv_bias_out), Tensor(transpose_out_2), Tensor(qk_out), Tensor(qktv_out), Tensor(softmax_out), Tensor(attn_dropout_mask_out), Tensor(attn_dropout_out), Tensor(src_mask_out), Tensor(fmha_out), Tensor(out_linear_out), Tensor(dropout_mask_out), Tensor(ln_mean_2), Tensor(ln_var_2), 
Tensor(bias_dropout_residual_out), Tensor(cache_kv_out), Tensor(out) diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index fbf5d8f15f2a9..899a43d6e8287 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -1249,6 +1249,12 @@ out_scale : OutScale out_scales : OutScales +- op : faster_tokenizer + inputs: + {vocab : Vocab, text : Text, text_pair : TextPair} + outputs: + {input_ids : InputIds, segment_ids : SegmentIds} + - op : fc inputs : input : Input From d1711fa53ea6e8a23b7f345d96ef3998dfc3fdad Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 15 Oct 2024 13:11:01 +0800 Subject: [PATCH 130/135] Clean framework_proto in cmake [fluid_ops] (#68331) * Fix * Fix * Fix * Fix * Fix --- .../distributed/ps/wrapper/CMakeLists.txt | 3 +- paddle/fluid/framework/CMakeLists.txt | 94 ++++--------------- paddle/fluid/framework/details/CMakeLists.txt | 3 +- .../framework/new_executor/CMakeLists.txt | 5 +- .../fluid/inference/tensorrt/CMakeLists.txt | 15 ++- .../inference/tensorrt/convert/CMakeLists.txt | 2 +- paddle/fluid/operators/pscore/CMakeLists.txt | 8 +- paddle/fluid/platform/CMakeLists.txt | 13 +-- paddle/fluid/platform/profiler/CMakeLists.txt | 2 +- paddle/testing/CMakeLists.txt | 1 - test/cpp/fluid/framework/CMakeLists.txt | 2 +- 11 files changed, 35 insertions(+), 113 deletions(-) diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt index d2095dfe26853..feeb04abb2393 100644 --- a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt @@ -5,8 +5,7 @@ set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS cc_library( fleet SRCS fleet.cc - DEPS framework_proto - ps_framework_proto + DEPS ps_framework_proto ps_service variable_helper scope diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b2e5e539c9bd8..db84d9e23c9f9 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -104,7 +104,7 @@ file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") cc_library( data_type SRCS data_type.cc - DEPS framework_proto) + DEPS phi) cc_library( tensor @@ -114,7 +114,7 @@ cc_library( cc_library( lod_tensor SRCS lod_tensor.cc - DEPS phi common tensor framework_proto version) + DEPS phi common tensor version) cc_library( garbage_collector @@ -129,7 +129,7 @@ cc_library( cc_library( var_type_traits SRCS var_type_traits.cc - DEPS framework_proto scope phi common) + DEPS scope phi common) # every source file that includes "dnnl.h" must depends on onednn # or, the first one should depends on onednn @@ -159,7 +159,7 @@ cc_library( cc_library( device_worker SRCS device_worker.cc - DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) + DEPS lod_tensor scope ${BRPC_DEPS}) cc_library( scope_pool SRCS scope_pool.cc @@ -212,7 +212,6 @@ cc_library( data_transform SRCS data_transform.cc DEPS tensor - framework_proto selected_rows_utils data_device_transform data_type_transform @@ -223,21 +222,21 @@ cc_library( cc_library( attribute SRCS attribute.cc - DEPS framework_proto phi common) + DEPS phi common) cc_library( op_version_proto SRCS op_version_proto.cc - DEPS framework_proto) + DEPS phi) cc_library( op_version_registry SRCS op_version_registry.cc - DEPS op_version_proto framework_proto) + DEPS op_version_proto phi) cc_library( op_proto_maker SRCS op_proto_maker.cc - DEPS framework_proto attribute ops_extra_info glog 
auto_parallel_proto) + DEPS phi attribute ops_extra_info glog auto_parallel_proto) cc_library( no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc @@ -245,7 +244,7 @@ cc_library( cc_library( op_info SRCS op_info.cc - DEPS attribute framework_proto no_need_buffer_vars_inference) + DEPS attribute phi no_need_buffer_vars_inference) cc_library( shape_inference SRCS shape_inference.cc @@ -257,16 +256,6 @@ if(WITH_ONEDNN) add_dependencies(shape_inference onednn) endif() -cc_library( - transfer_scope_cache - SRCS transfer_scope_cache.cc - DEPS scope framework_proto device_context) - -cc_library( - unused_var_check - SRCS unused_var_check.cc - DEPS glog no_need_buffer_vars_inference) - cc_library( op_kernel_type SRCS op_kernel_type.cc @@ -293,26 +282,21 @@ endif() if(WITH_XPU) cc_library( operator - SRCS operator.cc + SRCS operator.cc transfer_scope_cache.cc unused_var_check.cc + infershape_utils.cc DEPS xpu_op_list op_info proto_desc - device_context tensor scope glog - trainer_desc_proto - data_feed_proto shape_inference data_transform lod_tensor - transfer_scope_cache op_kernel_type op_call_stack - unused_var_check detail_op_handle phi_utils - infershape_utils phi common op_compat_infos @@ -320,25 +304,20 @@ if(WITH_XPU) else() cc_library( operator - SRCS operator.cc + SRCS operator.cc transfer_scope_cache.cc unused_var_check.cc + infershape_utils.cc DEPS op_info proto_desc - device_context tensor scope glog - trainer_desc_proto - data_feed_proto shape_inference data_transform lod_tensor - transfer_scope_cache op_kernel_type op_call_stack - unused_var_check detail_op_handle phi_utils - infershape_utils phi common op_compat_infos @@ -497,9 +476,8 @@ cc_library( set(NAIVE_EXECUTOR_DEPS op_registry - device_context scope - framework_proto + phi glog lod_rank_table feed_fetch_method @@ -561,8 +539,6 @@ if(WITH_DISTRIBUTE) DEPS fleet_wrapper op_registry scope - framework_proto - trainer_desc_proto glog framework_io heter_wrapper @@ -576,8 +552,6 @@ if(WITH_DISTRIBUTE) ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper - data_feed_proto - heter_service_proto ${BRPC_DEP}) set(DISTRIBUTE_COMPILE_FLAGS "") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) @@ -631,12 +605,7 @@ if(WITH_DISTRIBUTE) device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope - framework_proto - data_feed_proto - heter_service_proto - trainer_desc_proto glog index_sampler index_wrapper @@ -652,7 +621,6 @@ if(WITH_DISTRIBUTE) feed_hook graph_to_program_pass variable_helper - heter_service_proto fleet heter_server ${${EXTERNAL_BRPC_DEPS}} @@ -705,12 +673,7 @@ if(WITH_DISTRIBUTE) device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope - framework_proto - data_feed_proto - heter_service_proto - trainer_desc_proto glog lod_rank_table framework_io @@ -763,12 +726,7 @@ elseif(WITH_PSLIB) device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope - framework_proto - data_feed_proto - heter_service_proto - trainer_desc_proto glog lod_rank_table framework_io @@ -808,12 +766,7 @@ else() device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope - framework_proto - data_feed_proto - heter_service_proto - trainer_desc_proto glog lod_rank_table framework_io @@ -850,11 +803,11 @@ cc_library( cc_library( prune SRCS prune.cc - DEPS framework_proto auto_parallel_proto proto_desc) + DEPS phi auto_parallel_proto proto_desc) cc_library( selected_rows_utils SRCS selected_rows_utils.cc - DEPS phi common device_context) + DEPS phi) cc_library( 
dlpack_tensor @@ -866,19 +819,6 @@ cc_library( SRCS op_compatible_info.cc DEPS string_helper proto_desc) -cc_library( - infershape_utils - SRCS infershape_utils.cc - DEPS lod_tensor - selected_rows_utils - attribute - var_type_traits - phi - common - phi_utils - op_info - shape_inference) - # Get the current working branch execute_process( COMMAND git rev-parse --abbrev-ref HEAD @@ -903,7 +843,6 @@ cc_library( SRCS custom_operator.cc DEPS tensor attribute - framework_proto op_registry operator string_helper @@ -927,7 +866,6 @@ set(FLUID_FRAMEWORK_MODULES proto_desc lod_tensor executor - data_feed_proto layer phi common diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9d39a11b0eaa8..84150a7878b43 100755 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -6,7 +6,6 @@ set(op_handle_deps operator phi common - framework_proto node device_context op_registry @@ -39,7 +38,7 @@ else() DEPS ${op_handle_deps}) endif() -add_dependencies(detail_op_handle framework_proto auto_parallel_proto xxhash) +add_dependencies(detail_op_handle auto_parallel_proto xxhash) set(IR_PASS_DEPS graph_viz_pass diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index a2bdfee5f9f31..586cd63c204d0 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -29,8 +29,7 @@ set(standalone_executor_deps garbage_collector executor_gc_helper phi - common - framework_proto) + common) if(WITH_CINN) set(standalone_executor_deps @@ -53,7 +52,7 @@ cc_library( SRCS ${standalone_executor_srcs} DEPS ${standalone_executor_deps}) -add_dependencies(standalone_executor xxhash framework_proto) +add_dependencies(standalone_executor xxhash) if(WITH_ONEDNN) add_dependencies(standalone_executor onednn) diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 0fcffc730fb22..62edd7d1394d0 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -4,30 +4,29 @@ if(WIN32) nv_library( tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc - DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context - paddle_inference_api) + DEPS ${GLOB_OPERATOR_DEPS} phi paddle_inference_api) else() nv_library( tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc - DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) + DEPS ${GLOB_OPERATOR_DEPS} phi) endif() nv_library( tensorrt_dynamic_shape_infermeta_factory SRCS dynamic_shape_infermeta.cc - DEPS framework_proto) + DEPS phi) nv_library( tensorrt_plugin_arg_mapping_context SRCS plugin_arg_mapping_context.cc - DEPS framework_proto) + DEPS phi) nv_library( tensorrt_op_teller SRCS op_teller.cc - DEPS framework_proto device_context tensorrt_dynamic_shape_infermeta_factory) + DEPS phi tensorrt_dynamic_shape_infermeta_factory) nv_test( test_tensorrt SRCS test_tensorrt.cc - DEPS device_context phi common) + DEPS phi common) if(WIN32) nv_test( test_tensorrt_engine @@ -47,7 +46,7 @@ endif() nv_test( test_arg_mapping_context SRCS test_arg_mapping_context.cc - DEPS framework_proto tensorrt_plugin_arg_mapping_context) + DEPS phi tensorrt_plugin_arg_mapping_context) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt 
b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0bd410db10a07..a4fbf2289bd7d 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -143,7 +143,7 @@ nv_library( tensorrt_plugin operator scope - framework_proto + phi tensorrt_op_teller op_registry) diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index 25314c72b8033..174d0b70eab8c 100755 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -17,14 +17,12 @@ if(WITH_ARM_BRPC) heter_server heter_client ps_framework_proto - framework_proto sendrecv_rpc arm_brpc phi common glog - snappy - device_context) + snappy) else() list( APPEND @@ -36,13 +34,11 @@ else() heter_server heter_client ps_framework_proto - framework_proto sendrecv_rpc ${EXTERNAL_BRPC_DEPS} phi common - zlib - device_context) + zlib) endif() set(DISTRIBUTE_COMPILE_FLAGS diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 32a8c69cf6d0d..f004d9c139acc 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -40,14 +40,14 @@ if(WITH_GLOO) cc_library( gloo_context SRCS gloo_context.cc - DEPS framework_proto gloo_wrapper phi common) + DEPS gloo_wrapper phi common) endif() # separate init from device_context to avoid cycle dependencies cc_library( init SRCS init.cc - DEPS device_context phi common) + DEPS phi common) #fluid_memory depends on device_context, here add deps individually for # avoiding cycle dependencies @@ -59,7 +59,6 @@ cc_library( xxhash ${STREAM_CALLBACK_DEPS} eigen3 - framework_proto ${IPU_CTX_DEPS} ${ONEDNN_CTX_DEPS} ${dgc_deps} @@ -73,13 +72,7 @@ set(DEVICE_EVENT_LIBS) cc_library( lodtensor_printer SRCS lodtensor_printer.cc - DEPS phi - common - tensor - scope - lod_tensor - variable_helper - framework_proto) + DEPS phi common tensor scope lod_tensor variable_helper) add_subdirectory(profiler) diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index c443213aa0554..80282a765ee67 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( host_tracer SRCS host_tracer.cc - DEPS framework_proto phi common var_type_traits) + DEPS phi common var_type_traits) cc_library( cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 8d9ef68710001..67a63a86e1fae 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -21,7 +21,6 @@ if(WITH_TESTING) SRCS paddle_gtest_main.cc DEPS gtest xxhash - framework_proto eigen3 dlpack common diff --git a/test/cpp/fluid/framework/CMakeLists.txt b/test/cpp/fluid/framework/CMakeLists.txt index 6167d20121b22..bd1864e18be5c 100644 --- a/test/cpp/fluid/framework/CMakeLists.txt +++ b/test/cpp/fluid/framework/CMakeLists.txt @@ -231,7 +231,7 @@ cc_test( cc_test( infershape_utils_test SRCS infershape_utils_test.cc - DEPS infershape_utils phi common) + DEPS operator phi) if(WITH_TESTING AND TEST selected_rows_utils_test) set_tests_properties(selected_rows_utils_test PROPERTIES TIMEOUT 120) From 241999bef9b754059b54a7351b346bd777a8abc5 Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 15 Oct 2024 13:16:30 +0800 Subject: [PATCH 131/135] Fix typos initial (#68702) --- paddle/phi/kernels/autotune/auto_tune_base.h | 4 ++-- 
paddle/phi/kernels/autotune/switch_autotune.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 +- paddle/phi/kernels/cpu/auc_kernel.cc | 2 +- paddle/phi/kernels/cpu/kthvalue_kernel.cc | 2 +- paddle/phi/kernels/cpu/lstsq_kernel.cc | 4 ++-- paddle/phi/kernels/cpu/rnn_kernel.cc | 4 ++-- paddle/phi/kernels/dist_grad_kernel.cc | 4 ++-- paddle/phi/kernels/fused_attention_kernel.h | 4 ++-- paddle/phi/kernels/fusion/gpu/block_attn.h | 2 +- .../fusion/gpu/block_multi_head_attention_kernel.cu | 2 +- .../kernels/fusion/xpu/multi_encoder_xpu_kernel.cc | 6 +++--- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 2 +- paddle/phi/kernels/primitive/compute_primitives.h | 12 ++++++------ paddle/phi/kernels/primitive/functor_primitives.h | 2 +- .../impl/dgc_clip_by_norm_kernel_impl.h | 2 +- paddle/phi/kernels/xpu/cross_entropy_kernel.cc | 2 +- 18 files changed, 30 insertions(+), 30 deletions(-) diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index fa86bd002c16e..80d7028a3082f 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -285,7 +285,7 @@ MakeGatherGemmScatterTuner(ReturnType (*func)(T, T, Args...)) { Args...>::Instance(func); } -// Define the auto_tuner inital object. +// Define the auto_tuner initial object. #define DEFINE_AUTOTUNER_COMMON_OBJ(name) \ template \ class name##AutoTuner \ @@ -305,7 +305,7 @@ MakeGatherGemmScatterTuner(ReturnType (*func)(T, T, Args...)) { } \ }; -// Define the auto_tuner inital function. +// Define the auto_tuner initial function. #define DEFINE_AUTOTUNER_FN(name) \ template \ static name##AutoTuner* Make##name##Tuner( \ diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc index 3c9ba501b5dfa..2c87caee5c946 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.cc +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -39,7 +39,7 @@ void AutoTuneStatus::Update() { return; } - // This fuction is called when each iter finished. + // This function is called when each iter finished. if (current_steps_id_ + 1 < start_step_id_) { use_autotune_ = false; } else if (current_steps_id_ + 1 >= start_step_id_ && diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index fea53f55ce8df..637b77e5d9f08 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -84,7 +84,7 @@ void GetMemSizeAndDtype(const std::vector &lod_tensors, lod_tensors[i]->initialized() ? 
lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; ss << "input(" << i << "-th tensor) dim:(" << lod_tensors[i]->dims() << ") " - << " addres:" << ptr << " len: " << len << ", "; + << " address:" << ptr << " len: " << len << ", "; *numel += len; } VLOG(10) << ss.str(); diff --git a/paddle/phi/kernels/cpu/auc_kernel.cc b/paddle/phi/kernels/cpu/auc_kernel.cc index 3549e510b8eac..aa3a610b54694 100644 --- a/paddle/phi/kernels/cpu/auc_kernel.cc +++ b/paddle/phi/kernels/cpu/auc_kernel.cc @@ -163,7 +163,7 @@ void AucKernel(const Context &dev_ctx, auto *origin_stat_neg = dev_ctx.template Alloc(stat_neg_out); auto *auc_value = dev_ctx.template Alloc(auc); - // Just for pass UT, since UT's input & output connot be set same var + // Just for pass UT, since UT's input & output cannot be set same var auto *stat_pos_in_tensor = &stat_pos; auto *stat_neg_in_tensor = &stat_neg; auto *pos_in_data = stat_pos.data(); diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc index 0591e1bfdf320..abd731aacf66c 100644 --- a/paddle/phi/kernels/cpu/kthvalue_kernel.cc +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ -91,7 +91,7 @@ void KthvalueKernel(const Context& dev_ctx, 1, common::errors::InvalidArgument( "the k in the kthvalue must less equal than the " - "elemenents number of the input X, but received %d .", + "elements number of the input X, but received %d .", k)); phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, output); diff --git a/paddle/phi/kernels/cpu/lstsq_kernel.cc b/paddle/phi/kernels/cpu/lstsq_kernel.cc index 326e0bef1906e..5fa3ec9aa4183 100644 --- a/paddle/phi/kernels/cpu/lstsq_kernel.cc +++ b/paddle/phi/kernels/cpu/lstsq_kernel.cc @@ -60,7 +60,7 @@ void LstsqKernel(const Context& dev_ctx, int max_solu_stride = std::max(y_stride, ori_solu_stride); int min_solu_stride = std::min(y_stride, ori_solu_stride); - // lapack is a column-major storge, transpose make the input to + // lapack is a column-major storage, transpose make the input to // have a continuous memory layout int info = 0; int m = static_cast(x_dims[dim_size - 2]); @@ -204,7 +204,7 @@ void LstsqKernel(const Context& dev_ctx, rwork_data = dev_ctx.template Alloc(rwork); } - // "iwork" workspace array is relavant only for "gelsd" driver + // "iwork" workspace array is relevant only for "gelsd" driver DenseTensor* iwork = new DenseTensor(); int* iwork_data = nullptr; if (driver == LapackDriverType::Gelsd) { diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index 5b594089793c8..5e4a158e9af40 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -217,7 +217,7 @@ struct Layer { const std::string& mode, bool is_test, DenseTensor* cache_input) { - // crate the temp input for the X * W_ih^T + Bias_ih + // create the temp input for the X * W_ih^T + Bias_ih const int& hidden_size = weight.dims()[0]; // NOLINT cache_input->Resize( common::make_ddim({input.dims()[0], input.dims()[1], hidden_size})); @@ -272,7 +272,7 @@ struct Layer { DenseTensor* last_c, const DenseTensor& mask_tensor, const std::string& mode) { - // in the output, if mask flag is 0, we will retun the zero data + // in the output, if mask flag is 0, we will return the zero data auto& place = *dev_ctx.eigen_device(); auto out = EigenMatrix::Reshape(*output, output->dims().size() - 1); auto mask = EigenMatrix::From( diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 088a4fe4ffd26..a1b5482e23926 100644 
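The lstsq hunk above carries a comment worth a concrete gloss: LAPACK drivers assume column-major storage, so a row-major input must first be relaid into a contiguous column-major buffer. A minimal standalone sketch of that relayout (the `ToColumnMajor` helper is hypothetical, not part of the patch):

```cpp
#include <cstdio>
#include <vector>

// Convert an m x n row-major matrix into column-major order,
// the layout LAPACK drivers such as *gels/*gelsd expect.
std::vector<double> ToColumnMajor(const std::vector<double>& a, int m, int n) {
  std::vector<double> out(a.size());
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      out[j * m + i] = a[i * n + j];  // column j becomes contiguous
  return out;
}

int main() {
  // 2 x 3 row-major: [[1, 2, 3], [4, 5, 6]]
  std::vector<double> a = {1, 2, 3, 4, 5, 6};
  std::vector<double> col = ToColumnMajor(a, 2, 3);
  for (double v : col) std::printf("%.0f ", v);  // prints: 1 4 2 5 3 6
  return 0;
}
```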
--- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -65,8 +65,8 @@ void DistGradKernel(const Context& dev_ctx, dev_ctx, t, out, out_grad, p, -1, 1e-12, false, true, &x_grad_tmp); if (x_grad) { - // do reduce, the implemetation of cpu SumKernel has bug, it changes - // the dims of output iternally, so we Resize x/y_grad twice. + // do reduce, the implementation of cpu SumKernel has bug, it changes + // the dims of output internally, so we Resize x/y_grad twice. auto res_x = GetReduceDims(x_grad_tmp.dims(), x.dims()); if (!std::get<0>(res_x).empty()) { x_grad->Resize(common::make_ddim(std::get<1>(res_x))); diff --git a/paddle/phi/kernels/fused_attention_kernel.h b/paddle/phi/kernels/fused_attention_kernel.h index 529c49514066d..f1595a3e4328e 100644 --- a/paddle/phi/kernels/fused_attention_kernel.h +++ b/paddle/phi/kernels/fused_attention_kernel.h @@ -42,8 +42,8 @@ namespace phi { * @param num_heads The number head for multi_head_attention. * @param transpose_qkv_wb The qkv_w shape is (h, 3h), do transpose to it. * @param pre_layer_norm if true, the attention op uses pre_layer_norm - * architecure, else, uses post_layer_norm - * architecuture. [default false]. + * architecture, else, uses post_layer_norm + * architecture. [default false]. * @param epsilon Constant for numerical stability [default 1e-5]. * @param attn_dropout_rate Probability of setting units to zero. * @param is_test (bool, default false) Set to true for inference diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h index e6dbc3fd15687..2f667c8203af7 100644 --- a/paddle/phi/kernels/fusion/gpu/block_attn.h +++ b/paddle/phi/kernels/fusion/gpu/block_attn.h @@ -1689,7 +1689,7 @@ void blha(const phi::GPUContext &dev_ctx, mask_broadcast_num_heads = false; } else { PADDLE_THROW(errors::InvalidArgument( - "Unknow dimension for attn_mask, the q_num_head(2nd) " + "Unknown dimension for attn_mask, the q_num_head(2nd) " "dimension is invalid, it should be 1 or q_num_head(%d), " "but got %d", q_num_head, diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu index 40ec00ccbb0d9..160bbcbaa3e2b 100644 --- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu @@ -544,7 +544,7 @@ void DispatchWithDtype( // unpadding_v.numel(), // "unpadding_v", // unpadding_v.numel()); - // Reshape fmha_buf to 3-D because FlashAttnUnpaddedKernel requries + // Reshape fmha_buf to 3-D because FlashAttnUnpaddedKernel requires // q,k,v,out all in 3-D [token_num, q_num_head, dim_head]. auto fmha_shape = fmha_buf.dims(); fmha_buf.Resize({token_num, q_num_head, dim_head}); diff --git a/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc index 6af5cf68dbfd2..d7fa9bea060fd 100644 --- a/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc @@ -141,8 +141,8 @@ void MultiEncoderXPUKernel( fc_weight_data_XPUTypeFP16.push_back( reinterpret_cast(fc_weight[i]->data())); } else { - // Int8 weight also convert to int16_t* for temperary storage. - // The kenerl dytpe of int8 is choosen by quant_type in + // Int8 weight also convert to int16_t* for temporary storage. 
+ // The kernel dtype of int8 is chosen by quant_type in // xpu::transformer_encoder fc_weight_data_int16_t.push_back( reinterpret_cast(fc_weight[i]->data())); @@ -222,7 +222,7 @@ void MultiEncoderXPUKernel( if (!enable_int8 && local_quant) { TRANSFORMER_ENCODER_KERNEL_IMPL(XPUTypeFP16, XPUTypeFP16, float) } else { - // The kenerl dytpe of int8 is choosen by quant_type in + // The kernel dtype of int8 is chosen by quant_type in // xpu::transformer_encoder This template args, int16_t, is only for skip // quant fc TRANSFORMER_ENCODER_KERNEL_IMPL(XPUTypeFP16, int16_t, int16_t) diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc index dadaa2132e95e..dea68c3f1415f 100644 --- a/paddle/phi/kernels/is_empty_kernel.cc +++ b/paddle/phi/kernels/is_empty_kernel.cc @@ -24,7 +24,7 @@ void IsEmptyKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { // Note: is_empty is always executed on CPU and the output data should - // always be allocated for CPUPlace. We reigister CUDA kernel for this op to + // always be allocated for CPUPlace. We register CUDA kernel for this op to // avoid the unnecessary data transform. bool* out_data = dev_ctx.template HostAlloc(out); out_data[0] = common::product(x.dims()) == 0; diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index ddfc2cb897272..6be741df8dc54 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -72,7 +72,7 @@ void MemcpyD2HKernel(const Context& dev_ctx, default: PADDLE_THROW(errors::InvalidArgument( - "Arugment 'dst_place_type' only support 0-1, but got: %d", + "Argument 'dst_place_type' only support 0-1, but got: %d", dst_place_type)); break; } diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 30c2636a2bde9..82bb7f71ff6f7 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -471,8 +471,8 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) { } /* - * @brief Get ReturnsCount random data fromm compute according to state, state - * can be curandStatePhilox4_32_10_t, hiprandStatePhilox4_32_10_t which has beed + * @brief Get ReturnsCount random data from compute according to state, state + * can be curandStatePhilox4_32_10_t, hiprandStatePhilox4_32_10_t which has been * initialized. * * @template paraments @@ -580,8 +580,8 @@ __device__ __forceinline__ void Cumsum(OutT* out, * out: The register pointer of out, the size is 2. * in: The register pointer of input, the size is 2. * num: The num of this block - * monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles - * sorted in escending. + * monotonic_type: if monotonic_type = 1 then sorted in ascending order, else + * sorted in descending. */ #define SHARED_SIZE_LIMIT 1024 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must @@ -638,8 +638,8 @@ __device__ __forceinline__ void Sort(OutT* out, * in: The register pointer of input, the size is 2. * in_index: The register pointer of in_index, the size is 2. * num: The num of this block. - * monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles - * sorted in escending. + * monotonic_type: if monotonic_type = 1 then sorted in ascending order, else + * sorted in descending. 
*/ template __device__ __forceinline__ void Sort(OutT* out, diff --git a/paddle/phi/kernels/primitive/functor_primitives.h b/paddle/phi/kernels/primitive/functor_primitives.h index e2fcdbf7a8d2b..4ed6d24d0d6b8 100644 --- a/paddle/phi/kernels/primitive/functor_primitives.h +++ b/paddle/phi/kernels/primitive/functor_primitives.h @@ -234,7 +234,7 @@ struct DivFunctor(1.0f); } inline HOSTDEVICE T operator()(const T a, const T b) const { - // For int32/int64, need to check whether the divison is zero. + // For int32/int64, need to check whether the division is zero. PADDLE_ENFORCE_NE(b, 0, common::errors::InvalidArgument( diff --git a/paddle/phi/kernels/selected_rows/impl/dgc_clip_by_norm_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/dgc_clip_by_norm_kernel_impl.h index 1625f44896e5f..ef84271a3a8cc 100644 --- a/paddle/phi/kernels/selected_rows/impl/dgc_clip_by_norm_kernel_impl.h +++ b/paddle/phi/kernels/selected_rows/impl/dgc_clip_by_norm_kernel_impl.h @@ -38,7 +38,7 @@ void DGCClipByNormKernel(const Context& dev_ctx, if (static_cast(*current_step) < static_cast(rampup_begin_step)) { VLOG(10) << "current_step:" << *current_step << " < rampup_begin_step:" << rampup_begin_step - << " so does't use dgc_clip_by_norm"; + << " so doesn't use dgc_clip_by_norm"; return; } diff --git a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc index 46f8f5cd9feb0..c8e800d502e61 100644 --- a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc @@ -62,7 +62,7 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, // transpose before softmax due to the following two reasons: // 1. the XPU cross_entropy APIs supports cross entropy on the last dim // only, so the transpose here is unavoidable for them. - // 2. the XPU softmax api would do the transpose internaly if axis is not + // 2. the XPU softmax api would do the transpose internally if axis is not // the last dim and we can eliminate a transpose call if we explicitly // transpose the inputs before the softmax calculation. 
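The comment above ends the cross-entropy hunk's rationale: the XPU cross-entropy API only works on the last dimension, and the XPU softmax would transpose internally anyway when `axis` is not last, so one explicit transpose up front serves both calls. A standalone sketch of that axis-to-last relayout (plain CPU code for illustration, not the XPU kernel):

```cpp
#include <cstddef>
#include <vector>

// Move axis 1 of a [d0, d1, d2] tensor to the last position so that a
// routine restricted to the last dim can be applied along it.
std::vector<float> MoveAxis1Last(const std::vector<float>& x,
                                 size_t d0, size_t d1, size_t d2) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < d0; ++i)
    for (size_t j = 0; j < d1; ++j)
      for (size_t k = 0; k < d2; ++k)
        // (i, j, k) -> (i, k, j): the old axis-1 index j is now last.
        y[(i * d2 + k) * d1 + j] = x[(i * d1 + j) * d2 + k];
  return y;
}

int main() {
  // [1, 2, 3] tensor with values 0..5; axis 1 (size 2) moves last.
  std::vector<float> x = {0, 1, 2, 3, 4, 5};
  std::vector<float> y = MoveAxis1Last(x, 1, 2, 3);
  // y is now laid out as [1, 3, 2]: 0 3 1 4 2 5
  return 0;
}
```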
  XPUType* logits_trans = RAII_GUARD.alloc_l3_or_gm<XPUType>(len);

From 285d7fc44a4cee288066a65cdfe715d122a9385d Mon Sep 17 00:00:00 2001
From: co63oc
Date: Tue, 15 Oct 2024 14:12:56 +0800
Subject: 【Hackathon 7th Fundable Projects 1 No.53-54】 [fluid_ops] load
 (#68656)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix

* Fix

* Fix
---
 paddle/fluid/operators/load_op.cc             |  79 +---
 paddle/phi/core/framework/CMakeLists.txt      |   6 +
 paddle/phi/core/framework/convert_utils.h     |  27 ++
 .../core/framework/dense_tensor_tostream.cc   | 375 ++++++++++++++++++
 .../core/framework/dense_tensor_tostream.h    |  42 ++
 .../core/framework/lod_tensor_serialize.cc    | 130 ++++++
 .../phi/core/framework/lod_tensor_serialize.h |  53 +++
 .../core/framework/selected_rows_serialize.cc |  96 +++++
 .../core/framework/selected_rows_serialize.h  |  46 +++
 paddle/phi/core/framework/var_type_helper.cc  | 167 ++++++++
 paddle/phi/core/framework/var_type_helper.h   | 264 ++++++++++++
 paddle/phi/infermeta/nullary.cc               |   2 +
 paddle/phi/infermeta/nullary.h                |   2 +
 paddle/phi/kernels/cpu/load_kernel.cc         |  17 +
 paddle/phi/kernels/gpu/load_kernel.cu         |  17 +
 paddle/phi/kernels/impl/load_kernel_impl.h    |  63 +++
 .../kernels/selected_rows/cpu/load_kernel.cc  |  18 +
 .../kernels/selected_rows/gpu/load_kernel.cu  |  18 +
 .../selected_rows/impl/load_kernel_impl.h     |  47 +++
 paddle/phi/kernels/xpu/load_kernel.cc         |  17 +
 20 files changed, 1409 insertions(+), 77 deletions(-)
 create mode 100644 paddle/phi/core/framework/convert_utils.h
 create mode 100644 paddle/phi/core/framework/dense_tensor_tostream.cc
 create mode 100644 paddle/phi/core/framework/dense_tensor_tostream.h
 create mode 100644 paddle/phi/core/framework/lod_tensor_serialize.cc
 create mode 100644 paddle/phi/core/framework/lod_tensor_serialize.h
 create mode 100644 paddle/phi/core/framework/selected_rows_serialize.cc
 create mode 100644 paddle/phi/core/framework/selected_rows_serialize.h
 create mode 100644 paddle/phi/core/framework/var_type_helper.cc
 create mode 100644 paddle/phi/core/framework/var_type_helper.h
 create mode 100644 paddle/phi/kernels/cpu/load_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/load_kernel.cu
 create mode 100644 paddle/phi/kernels/impl/load_kernel_impl.h
 create mode 100644 paddle/phi/kernels/selected_rows/cpu/load_kernel.cc
 create mode 100644 paddle/phi/kernels/selected_rows/gpu/load_kernel.cu
 create mode 100644 paddle/phi/kernels/selected_rows/impl/load_kernel_impl.h
 create mode 100644 paddle/phi/kernels/xpu/load_kernel.cc

diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index f8addec8ea1d1..7f12a382291bf 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -15,74 +15,13 @@ limitations under the License.
*/ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/phi/core/framework/lod_tensor_serialize.h" +#include "paddle/phi/core/framework/selected_rows_serialize.h" #include "paddle/phi/kernels/cast_kernel.h" namespace paddle { namespace operators { - -template -void LoadKernel(const Context& dev_ctx, - const std::string& file_path, - int64_t seek, - const std::vector& shape, - bool load_as_fp16, - phi::DenseTensor* out) { - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ifstream fin(file_path, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), - true, - common::errors::Unavailable( - "Load operator fail to open file %s, please check " - "whether the model file is complete or damaged.", - file_path)); - PADDLE_ENFORCE_NOT_NULL(out, - common::errors::InvalidArgument( - "The variable to be loaded cannot be found.")); - - if (seek != -1) { - PADDLE_ENFORCE_GE(seek, - 0, - common::errors::InvalidArgument( - "seek with tensor must great than or equal to 0")); - framework::DeserializeFromStream(fin, out, dev_ctx, seek, shape); - } else { - framework::DeserializeFromStream(fin, out, dev_ctx); - } - - auto in_dtype = out->dtype(); - auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype; - if (in_dtype != out_dtype) { - phi::CastKernel(dev_ctx, *out, out_dtype, out); - } -} - -template -void LoadSelectedRowsKernel(const Context& dev_ctx, - const std::string& file_path, - int64_t seek, - const std::vector& shape, - bool load_as_fp16, - phi::SelectedRows* out) { - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
- std::ifstream fin(file_path, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), - true, - common::errors::Unavailable( - "Load operator fail to open file %s, please check " - "whether the model file is complete or damaged.", - file_path)); - PADDLE_ENFORCE_NOT_NULL(out, - common::errors::InvalidArgument( - "The variable to be loaded cannot be found.")); - - framework::DeserializeFromStream(fin, out, dev_ctx); -} - class LoadOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -128,17 +67,3 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); - -PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} -PD_REGISTER_KERNEL( - load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} -PD_REGISTER_KERNEL( - load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#endif - -#ifdef PADDLE_WITH_XPU -PD_REGISTER_KERNEL(load, XPU, ALL_LAYOUT, ops::LoadKernel, float) {} -#endif diff --git a/paddle/phi/core/framework/CMakeLists.txt b/paddle/phi/core/framework/CMakeLists.txt index dc025f19903c2..977d50dbf2730 100644 --- a/paddle/phi/core/framework/CMakeLists.txt +++ b/paddle/phi/core/framework/CMakeLists.txt @@ -4,3 +4,9 @@ proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto) proto_library(heter_service_proto SRCS heter_service.proto) + +file( + GLOB framework_cc + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cc") +collect_srcs(core_srcs SRCS ${framework_cc}) diff --git a/paddle/phi/core/framework/convert_utils.h b/paddle/phi/core/framework/convert_utils.h new file mode 100644 index 0000000000000..536970a3bd86e --- /dev/null +++ b/paddle/phi/core/framework/convert_utils.h @@ -0,0 +1,27 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/framework/var_type_helper.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +inline proto::VarType::Type TransToProtoVarTypeReturnType( + const DataType& dtype) { + return static_cast(phi::TransToProtoVarType(dtype)); +} + +} // namespace phi diff --git a/paddle/phi/core/framework/dense_tensor_tostream.cc b/paddle/phi/core/framework/dense_tensor_tostream.cc new file mode 100644 index 0000000000000..c370f51845574 --- /dev/null +++ b/paddle/phi/core/framework/dense_tensor_tostream.cc @@ -0,0 +1,375 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
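Before the new files themselves, a usage sketch of the relocated serialization entry points. The signatures are the ones declared in `lod_tensor_serialize.h` below; the `RoundTrip` wrapper itself is hypothetical and assumes a Paddle tree that contains this patch:

```cpp
#include <sstream>

#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/framework/lod_tensor_serialize.h"

// Round-trip a DenseTensor through the relocated phi serialization API:
// version word + LoD section + tensor section go out, then come back in.
void RoundTrip(const phi::DenseTensor& src, phi::DenseTensor* dst) {
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  const phi::DeviceContext* ctx = pool.Get(src.place());

  std::stringstream buffer;
  phi::SerializeToStream(buffer, src, *ctx);
  phi::DeserializeFromStream(buffer, dst, *ctx);
}
```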
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/framework/dense_tensor_tostream.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/framework/convert_utils.h" +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/contiguous_kernel.h" + +namespace phi { + +namespace proto = paddle::framework::proto; + +template +phi::DenseTensor InnerTensorContiguous(const Context& dev_ctx, + const phi::DenseTensor& tensor) { + phi::DenseTensor dense_out; + phi::MetaTensor meta_input(tensor); + phi::MetaTensor meta_out(&dense_out); + UnchangedInferMeta(meta_input, &meta_out); + + PD_VISIT_ALL_TYPES(tensor.dtype(), "InnerTensorContiguous", ([&] { + phi::ContiguousKernel( + dev_ctx, tensor, &dense_out); + })); + return dense_out; +} + +phi::DenseTensor InnerTensorContiguous(const phi::DenseTensor& tensor) { + auto& pool = phi::DeviceContextPool::Instance(); + + if (tensor.place().GetType() == phi::AllocationType::CPU) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + return InnerTensorContiguous(*dev_ctx, tensor); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (tensor.place().GetType() == phi::AllocationType::GPU) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + return InnerTensorContiguous(*dev_ctx, tensor); +#endif +#ifdef PADDLE_WITH_XPU + } else if (tensor.place().GetType() == phi::AllocationType::XPU) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + return InnerTensorContiguous(*dev_ctx, tensor); +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (tensor.place().GetType() == phi::AllocationType::CUSTOM) { + auto* dev_ctx = static_cast(pool.Get(tensor.place())); + phi::DenseTensor dense_out; + phi::MetaTensor meta_input(tensor); + phi::MetaTensor meta_out(&dense_out); + UnchangedInferMeta(meta_input, &meta_out); + const phi::KernelKey& kernel_key = {phi::TransToPhiBackend(tensor.place()), + phi::DataLayout::ALL_LAYOUT, + tensor.dtype()}; + using kernel_signature = void (*)( + const phi::DeviceContext&, const phi::DenseTensor&, phi::DenseTensor*); + PD_VISIT_KERNEL("contiguous", + kernel_key, + kernel_signature, + false, + *dev_ctx, + tensor, + &dense_out); + return dense_out; +#endif + } else { + PADDLE_THROW(common::errors::Unimplemented( + "Place type is not supported when casting data type.")); + } + + return tensor; +} + +void TensorToStream(std::ostream& os, + const phi::DenseTensor& tensor, + const phi::DeviceContext& dev_ctx) { + const auto ensure_contiguous = [](const phi::DenseTensor& tensor) { + if (tensor.meta().is_contiguous()) { + return tensor; + } + return InnerTensorContiguous(tensor); + }; + const phi::DenseTensor& contiguous_tensor = ensure_contiguous(tensor); + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf 
message + proto::VarType::TensorDesc desc; + desc.set_data_type( + TransToProtoVarTypeReturnType(contiguous_tensor.dtype())); + auto dims = common::vectorize(contiguous_tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = + contiguous_tensor.numel() * phi::SizeOf(contiguous_tensor.dtype()); + + auto* data_ptr = contiguous_tensor.data(); + PADDLE_ENFORCE_LT(size, + (std::numeric_limits::max)(), + common::errors::ResourceExhausted( + "tensor size %d overflow when writing tensor", size)); + if (phi::is_gpu_place(contiguous_tensor.place())) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = static_cast(dev_ctx); + phi::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + phi::memory_utils::Copy(cpu, + buf.get(), + contiguous_tensor.place(), + reinterpret_cast(data), // NOLINT + size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(common::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); +#endif + } else if (phi::is_xpu_place(contiguous_tensor.place())) { +#ifdef PADDLE_WITH_XPU + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& xpu_dev_ctx = static_cast(dev_ctx); + phi::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + phi::memory_utils::Copy(cpu, + buf.get(), + contiguous_tensor.place(), + reinterpret_cast(data), + size_to_write); + xpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(common::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (phi::is_custom_place(contiguous_tensor.place())) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); // NOLINT + auto& custom_device_context = + static_cast(dev_ctx); + phi::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + phi::memory_utils::Copy(cpu, + buf.get(), + contiguous_tensor.place(), + reinterpret_cast(data), + size_to_write, + custom_device_context.stream()); + custom_device_context.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(common::errors::Unimplemented( + "CustomPlace is not supported when not compiled with " + "CustomDevice")); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, + phi::DenseTensor* tensor, + const phi::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void apply() { + auto& pool = phi::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place_); + *buf_ = 
dev_ctx->Alloc(tensor_); + } + + void** buf_; + phi::DenseTensor* tensor_; + phi::Place place_; +}; + +void TensorFromStream(std::istream& is, + phi::DenseTensor* tensor, + const phi::DeviceContext& dev_ctx, + const size_t& seek, + const std::vector& shape) { + uint32_t version = 0; + is.read(reinterpret_cast(&version), sizeof(version)); + + PADDLE_ENFORCE_EQ( + version, + 0U, + common::errors::InvalidArgument( + "tensor version %u is not supported, Only version 0 is supported", + version)); + + proto::VarType::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size = 0; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); // NOLINT + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE_EQ( + desc.ParseFromArray(buf.get(), size), + true, + common::errors::InvalidArgument("Cannot parse tensor desc")); + } + { // read tensor + tensor->Resize(common::make_ddim(shape)); + size_t seekg = seek * SizeOfType(desc.data_type()); + is.seekg(seekg, is.cur); // NOLINT + + void* buf = nullptr; + phi::CPUContext ctx; + size_t size = tensor->numel() * SizeOfType(desc.data_type()); + if (phi::is_gpu_place(dev_ctx.GetPlace()) || + phi::is_xpu_place(dev_ctx.GetPlace()) || + phi::is_custom_place(dev_ctx.GetPlace())) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + phi::DenseTensor cpu_tensor; + cpu_tensor.Resize(common::make_ddim(shape)); + VisitDataType(desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); // NOLINT + auto dst_place = dev_ctx.GetPlace(); + phi::Copy(dev_ctx, cpu_tensor, dst_place, false, tensor); + if (phi::is_custom_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } +#else + if (phi::is_gpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(common::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); + } else if (phi::is_xpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(common::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); + } +#endif + } else { + VisitDataType(desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); // NOLINT + } + } +} + +void TensorFromStream(std::istream& is, + phi::DenseTensor* tensor, + const phi::DeviceContext& dev_ctx) { + uint32_t version = 0; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ( + version, + 0U, + common::errors::InvalidArgument( + "tensor version %u is not supported, Only version 0 is supported", + version)); + proto::VarType::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size = -1; + is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ( + is.good(), + true, + common::errors::Unavailable("Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, + 0, + common::errors::InvalidArgument( + "phi::DenseTensor desc size should >= 0")); + std::unique_ptr buf(new char[size]); // NOLINT + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE_EQ( + desc.ParseFromArray(buf.get(), size), + true, + common::errors::InvalidArgument("Cannot parse tensor desc")); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(common::make_ddim(dims)); + void* buf = nullptr; + phi::CPUContext ctx; + size_t size = tensor->numel() * SizeOfType(desc.data_type()); + if 
(phi::is_gpu_place(dev_ctx.GetPlace()) || + phi::is_xpu_place(dev_ctx.GetPlace()) || + phi::is_custom_place(dev_ctx.GetPlace())) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) + phi::DenseTensor cpu_tensor; + cpu_tensor.Resize(common::make_ddim(dims)); + VisitDataType(desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); // NOLINT + auto dst_place = dev_ctx.GetPlace(); + phi::Copy(dev_ctx, cpu_tensor, dst_place, false, tensor); + if (phi::is_custom_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } +#else + if (phi::is_gpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(common::errors::Unimplemented( + "CUDAPlace is not supported when not compiled with CUDA")); + } else if (phi::is_xpu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(common::errors::Unimplemented( + "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW( + common::errors::Unimplemented("CustomPlace is not supported when " + "not compiled with CustomDevice")); + } +#endif + } else { + VisitDataType(desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); // NOLINT + } + } +} + +} // namespace phi diff --git a/paddle/phi/core/framework/dense_tensor_tostream.h b/paddle/phi/core/framework/dense_tensor_tostream.h new file mode 100644 index 0000000000000..945babff45bf6 --- /dev/null +++ b/paddle/phi/core/framework/dense_tensor_tostream.h @@ -0,0 +1,42 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/framework/var_type_helper.h" +#include "paddle/phi/core/memory/memory.h" +#include "paddle/phi/core/platform/device_context.h" + +namespace phi { + +TEST_API void TensorToStream(std::ostream& os, + const phi::DenseTensor& tensor, + const phi::DeviceContext& dev_ctx); +TEST_API void TensorFromStream(std::istream& is, + phi::DenseTensor* tensor, + const phi::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, + phi::DenseTensor* tensor, + const phi::DeviceContext& dev_ctx, + const size_t& seek, + const std::vector& shape); + +} // namespace phi diff --git a/paddle/phi/core/framework/lod_tensor_serialize.cc b/paddle/phi/core/framework/lod_tensor_serialize.cc new file mode 100644 index 0000000000000..2a06780c9ca1a --- /dev/null +++ b/paddle/phi/core/framework/lod_tensor_serialize.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
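For reference, `TensorToStream`/`TensorFromStream` above fix the tensor section's wire layout: a `uint32_t` version (always 0 so far), an `int32_t` byte count for the serialized `TensorDesc` proto, the desc bytes, then the raw buffer. A small reader for just that fixed header (hypothetical helper, standard library only):

```cpp
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Read the fixed header of a tensor section: uint32_t version,
// int32_t desc byte count, then the serialized TensorDesc bytes.
// The raw tensor data follows them in the stream.
bool ReadTensorHeader(const std::string& path, std::string* desc_bytes) {
  std::ifstream is(path, std::ios::binary);
  if (!is) return false;

  uint32_t version = 0;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  if (!is || version != 0U) return false;  // only version 0 is defined

  int32_t desc_size = 0;
  is.read(reinterpret_cast<char*>(&desc_size), sizeof(desc_size));
  if (!is || desc_size < 0) return false;

  if (desc_size > 0) {
    std::vector<char> buf(static_cast<size_t>(desc_size));
    is.read(buf.data(), desc_size);
    desc_bytes->assign(buf.begin(), buf.end());
  }
  return static_cast<bool>(is);
}
```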
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/framework/lod_tensor_serialize.h"
+#include
+#include "paddle/phi/core/framework/convert_utils.h"
+
+namespace phi {
+
+void SerializeToStream(std::ostream &os,
+                       const phi::DenseTensor &tensor,
+                       const phi::DeviceContext &dev_ctx) {
+  constexpr uint32_t kCurTensorVersion = 0;
+  {  // the 1st field, uint32_t version for DenseTensor
+    os.write(reinterpret_cast<const char *>(&kCurTensorVersion),
+             sizeof(kCurTensorVersion));
+  }
+  {
+    // the 2nd field, LoD information
+    // uint64_t lod_level
+    // uint64_t lod_level_1 size in byte.
+    // int*     lod_level_1 data
+    // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+    for (auto &each : lod) {
+      size = each.size() * sizeof(phi::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  // the 3rd field, Tensor
+  TensorToStream(os, static_cast<phi::DenseTensor>(tensor), dev_ctx);
+}
+
+void SerializeToStream(std::ostream &os, const phi::DenseTensor &tensor) {
+  phi::DeviceContextPool &pool = phi::DeviceContextPool::Instance();
+  const phi::DeviceContext *dev_ctx = nullptr;
+  auto place = tensor.place();
+  dev_ctx = pool.Get(place);
+  SerializeToStream(os, tensor, *dev_ctx);
+}
+
+void DeserializeFromStream(std::istream &os, phi::DenseTensor *tensor) {
+  phi::DeviceContextPool &pool = phi::DeviceContextPool::Instance();
+  const phi::DeviceContext *dev_ctx = nullptr;
+  dev_ctx = pool.Get(phi::CPUPlace());
+  DeserializeFromStream(os, tensor, *dev_ctx);
+}
+
+void DeserializeFromStream(std::istream &is,
+                           phi::DenseTensor *tensor,
+                           const phi::DeviceContext &dev_ctx,
+                           const size_t &seek,
+                           const std::vector<int64_t> &shape) {
+  {
+    // the 1st field, uint32_t version for DenseTensor
+    uint32_t version = 0;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+
+    PADDLE_ENFORCE_EQ(
+        version,
+        0U,
+        common::errors::InvalidArgument(
+            "Deserialize to tensor failed, maybe the loaded file is "
+            "not a paddle model(expected file format: 0, but %u found).",
+            version));
+  }
+  {
+    // the 2nd field, LoD information
+    uint64_t lod_level = 0;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+  }
+  // the 3rd field, Tensor
+  TensorFromStream(
+      is, static_cast<phi::DenseTensor *>(tensor), dev_ctx, seek, shape);
+}
+
+void DeserializeFromStream(std::istream &is,
+                           phi::DenseTensor *tensor,
+                           const phi::DeviceContext &dev_ctx) {
+  {
+    // the 1st field, uint32_t version for DenseTensor
+    uint32_t version = 0;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+
+    PADDLE_ENFORCE_EQ(
+        version,
+        0U,
+        common::errors::InvalidArgument(
+            "Deserialize to tensor failed, maybe the loaded file is "
+            "not a paddle model(expected file format: 0, but %u found).",
+            version));
+  }
+  {
+    // the 2nd field, LoD information
+    uint64_t lod_level = 0;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size = 0;
+
is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + TensorFromStream(is, static_cast(tensor), dev_ctx); +} + +} // namespace phi diff --git a/paddle/phi/core/framework/lod_tensor_serialize.h b/paddle/phi/core/framework/lod_tensor_serialize.h new file mode 100644 index 0000000000000..55d17eeaf4534 --- /dev/null +++ b/paddle/phi/core/framework/lod_tensor_serialize.h @@ -0,0 +1,53 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/common/ddim.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/framework/dense_tensor_tostream.h" +#include "paddle/phi/core/mixed_vector.h" +#include "paddle/utils/test_macros.h" + +namespace phi { + +/* + * Serialize/Deserialize phi::DenseTensor to std::ostream + * You can pass ofstream or ostringstream to serialize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, + const phi::DenseTensor& tensor, + const phi::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, + phi::DenseTensor* tensor, + const phi::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, + phi::DenseTensor* tensor, + const phi::DeviceContext& dev_ctx, + const size_t& seek, + const std::vector& shape); + +void SerializeToStream(std::ostream& os, const phi::DenseTensor& tensor); + +void DeserializeFromStream(std::istream& os, phi::DenseTensor* tensor); + +} // namespace phi diff --git a/paddle/phi/core/framework/selected_rows_serialize.cc b/paddle/phi/core/framework/selected_rows_serialize.cc new file mode 100644 index 0000000000000..d4fba8d1d6ed6 --- /dev/null +++ b/paddle/phi/core/framework/selected_rows_serialize.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
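The LoD section written by `SerializeToStream` above has an equally simple layout: a `uint64_t` level count, then each level's byte length followed by its raw `size_t` offsets. Mirrored here as a standalone writer (illustrative sketch, not the patch's code):

```cpp
#include <cstdint>
#include <ostream>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

// Emit the LoD section in the same wire format as SerializeToStream:
// level count, then per level its byte size and raw offset values.
void WriteLoD(std::ostream& os, const LoD& lod) {
  uint64_t levels = lod.size();
  os.write(reinterpret_cast<const char*>(&levels), sizeof(levels));
  for (const auto& level : lod) {
    uint64_t bytes = level.size() * sizeof(size_t);
    os.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
    os.write(reinterpret_cast<const char*>(level.data()),
             static_cast<std::streamsize>(bytes));
  }
}
```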
+
+#include "paddle/phi/core/framework/selected_rows_serialize.h"
+
+namespace phi {
+
+void SerializeToStream(std::ostream& os,
+                       const phi::SelectedRows& selected_rows,
+                       const phi::DeviceContext& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {
+    // the 2nd field, rows information
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {
+    // the 3rd field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4th field, Tensor data
+  TensorToStream(os, selected_rows.value(), dev_ctx);
+}
+
+void SerializeToStream(std::ostream& os,
+                       const phi::SelectedRows& selected_rows) {
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  const phi::DeviceContext* dev_ctx = nullptr;
+  auto place = selected_rows.place();
+  dev_ctx = pool.Get(place);
+  SerializeToStream(os, selected_rows, *dev_ctx);
+}
+
+void DeserializeFromStream(std::istream& is, phi::SelectedRows* selected_rows) {
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  const phi::DeviceContext* dev_ctx = nullptr;
+  dev_ctx = pool.Get(phi::CPUPlace());
+  DeserializeFromStream(is, selected_rows, *dev_ctx);
+}
+
+void DeserializeFromStream(std::istream& is,
+                           phi::SelectedRows* selected_rows,
+                           const phi::DeviceContext& dev_ctx) {
+  {
+    // the 1st field, uint32_t version for SelectedRows
+    uint32_t version = 0;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version,
+                      0U,
+                      common::errors::InvalidArgument(
+                          "Only version 0 SelectedRows is supported."));
+  }
+  {
+    // the 2nd field, rows information
+    uint64_t size = 0;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    PADDLE_ENFORCE_EQ(
+        is.good(),
+        true,
+        common::errors::Unavailable("Cannot read the number of rows."));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {
+    // the 3rd field, the height of the SelectedRows
+    int64_t height = 0;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4th field, tensor which contains the data
+  TensorFromStream(is, selected_rows->mutable_value(), dev_ctx);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/core/framework/selected_rows_serialize.h b/paddle/phi/core/framework/selected_rows_serialize.h
new file mode 100644
index 0000000000000..82af6a7374e6d
--- /dev/null
+++ b/paddle/phi/core/framework/selected_rows_serialize.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/phi/core/framework/dense_tensor_tostream.h" +#include "paddle/phi/core/platform/device_context.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +/* + * Serialize/Deserialize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serialize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, + const phi::SelectedRows& selected_rows, + const phi::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, + phi::SelectedRows* selected_rows, + const phi::DeviceContext& dev_ctx); + +void SerializeToStream(std::ostream& os, + const phi::SelectedRows& selected_rows); + +void DeserializeFromStream(std::istream& is, phi::SelectedRows* selected_rows); + +} // namespace phi diff --git a/paddle/phi/core/framework/var_type_helper.cc b/paddle/phi/core/framework/var_type_helper.cc new file mode 100644 index 0000000000000..03d2708f8bb8c --- /dev/null +++ b/paddle/phi/core/framework/var_type_helper.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/framework/var_type_helper.h" + +#include + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/pstring.h" + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using pstring = phi::dtype::pstring; + +namespace phi { + +struct DataTypeMap { + std::unordered_map cpp_to_proto_; + std::unordered_map proto_to_cpp_; + std::unordered_map proto_to_str_; + std::unordered_map proto_to_size_; +}; + +static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. 
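The comment above leans on C++11 "magic statics": initialization of a function-local static is guaranteed to happen exactly once, and concurrent callers block until it completes, so no explicit mutex is needed. Distilled to a minimal example (hypothetical `Registry` type, not part of the patch):

```cpp
#include <cstdio>

struct Registry {
  int value = 42;
};

// Function-local static: the first caller initializes it and any
// concurrent caller waits for that initialization to finish -- the
// same guarantee gDataTypeMap() relies on above.
Registry& GlobalRegistry() {
  static Registry instance;  // thread-safe one-time initialization
  return instance;
}

int main() {
  std::printf("%d\n", GlobalRegistry().value);
  return 0;
}
```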
diff --git a/paddle/phi/core/framework/var_type_helper.cc b/paddle/phi/core/framework/var_type_helper.cc
new file mode 100644
index 0000000000000..03d2708f8bb8c
--- /dev/null
+++ b/paddle/phi/core/framework/var_type_helper.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/framework/var_type_helper.h"
+
+#include <stdint.h>
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/common/pstring.h"
+
+using float16 = phi::dtype::float16;
+using bfloat16 = phi::dtype::bfloat16;
+using pstring = phi::dtype::pstring;
+
+namespace phi {
+
+struct DataTypeMap {
+  std::unordered_map<std::type_index, proto::VarType::Type> cpp_to_proto_;
+  std::unordered_map<int, std::type_index> proto_to_cpp_;
+  std::unordered_map<int, std::string> proto_to_str_;
+  std::unordered_map<int, size_t> proto_to_size_;
+};
+
+static DataTypeMap* InitDataTypeMap();
+// C++11 removes the need for manual locking. Concurrent execution shall wait
+// if a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
+static DataTypeMap& gDataTypeMap() {
+  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
+  return *g_data_type_map_;
+}
+
+template <typename T>
+static inline void RegisterType(DataTypeMap* map,
+                                proto::VarType::Type proto_type,
+                                const std::string& name) {
+  map->proto_to_cpp_.emplace(static_cast<int>(proto_type), typeid(T));
+  map->cpp_to_proto_.emplace(typeid(T), proto_type);
+  map->proto_to_str_.emplace(static_cast<int>(proto_type), name);
+  map->proto_to_size_.emplace(static_cast<int>(proto_type), sizeof(T));
+}
+
+static DataTypeMap* InitDataTypeMap() {
+  auto retv = new DataTypeMap();
+
+#define RegType(cc_type, proto_type) \
+  RegisterType<cc_type>(retv, proto_type, #cc_type)
+
+  _ForEachDataType_(RegType);
+  // Register pstring individually
+  RegType(pstring, proto::VarType::PSTRING);
+  RegType(::phi::dtype::float8_e5m2, proto::VarType::FP8_E5M2);
+  RegType(::phi::dtype::float8_e4m3fn, proto::VarType::FP8_E4M3FN);
+#undef RegType
+  return retv;
+}
+
+proto::VarType::Type ToDataType(std::type_index type) {
+  auto it = gDataTypeMap().cpp_to_proto_.find(type);
+  if (it != gDataTypeMap().cpp_to_proto_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW(common::errors::Unimplemented(
+      "Not support %s as tensor data type.", common::demangle(type.name())));
+}
+
+std::type_index ToTypeIndex(proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_cpp_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_cpp_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW(common::errors::Unimplemented(
+      "Not support proto::VarType::Type(%d) as tensor type.",
+      static_cast<int>(type)));
+}
+
+std::string DataTypeToString(const proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_str_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_str_.end()) {
+    return it->second;
+  }
+  // deal with RAW type
+  if (type == proto::VarType::RAW) {
+    return "RAW(runtime decided type)";
+  }
+  PADDLE_THROW(common::errors::Unimplemented(
+      "Not support proto::VarType::Type(%d) as tensor type.",
+      static_cast<int>(type)));
+}
+
+size_t SizeOfType(proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_size_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_size_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW(common::errors::Unimplemented("Not support %s as tensor type.",
+                                             DataTypeToString(type)));
+}
+
+// Now only supports promotion of complex type
+inline bool NeedPromoteTypes(const proto::VarType::Type& a,
+                             const proto::VarType::Type& b) {
+  return (IsComplexType(a) || IsComplexType(b));
+}
+
+int DataTypeNumAlign(const proto::VarType::Type t) {
+  int cast_type_num = -1;
+  if (t == proto::VarType::FP32 || t == proto::VarType::FP64) {
+    cast_type_num = static_cast<int>(t) - 5;
+  } else if (t == proto::VarType::COMPLEX64 ||
+             t == proto::VarType::COMPLEX128) {
+    cast_type_num = static_cast<int>(t) - 21;
+  } else {
+    PADDLE_THROW(common::errors::Unavailable(
+        "Only supports to align data type include float32, float64, complex64 "
+        "and complex128, but received data type is `%s`.",
+        DataTypeToString(t)));
+  }
+  return cast_type_num;
+}
+
+// Now only supports promotion of complex type
+proto::VarType::Type PromoteTypesIfComplexExists(
+    const proto::VarType::Type type_a, const proto::VarType::Type type_b) {
+  constexpr auto f4 = proto::VarType::FP32;        // 5
+  constexpr auto f8 = proto::VarType::FP64;        // 6
+  constexpr auto c4 = proto::VarType::COMPLEX64;   // 23
+  constexpr auto c8 = proto::VarType::COMPLEX128;  // 24
+
+  if (!NeedPromoteTypes(type_a, type_b)) {
+    // NOTE(chenweihang): keep consistent with rule in original op's impl,
+    // kernel type based on the first input tensor's dtype
+    return type_a;
+  }
+
+  int type_an = DataTypeNumAlign(type_a);
+  int type_bn = DataTypeNumAlign(type_b);
+
+  // Here is a complete rules table, but some rules are not used.
+  // It is still written this way because array accessing is still
+  // more efficient than if-else
+  // NOLINTBEGIN(*-avoid-c-arrays)
+  static constexpr proto::VarType::Type promote_types_table[4][4] = {
+      /*        f4  f8  c4  c8*/
+      /* f4 */ {f4, f8, c4, c8},
+      /* f8 */ {f8, f8, c8, c8},
+      /* c4 */ {c4, c8, c4, c8},
+      /* c8 */ {c8, c8, c8, c8},
+  };
+  // NOLINTEND(*-avoid-c-arrays)
+
+  return promote_types_table[type_an][type_bn];
+}
+
+}  // namespace phi
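
[Editor's note] As a reading aid for the table above: combining a real and a complex
type always widens to the complex type of the larger precision. For example
(illustrative only; the values follow the comments inside the function):

    // FP64 (f8) combined with COMPLEX64 (c4) promotes to COMPLEX128 (c8):
    auto promoted = phi::PromoteTypesIfComplexExists(proto::VarType::FP64,
                                                     proto::VarType::COMPLEX64);
    // promoted == proto::VarType::COMPLEX128
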
diff --git a/paddle/phi/core/framework/var_type_helper.h b/paddle/phi/core/framework/var_type_helper.h
new file mode 100644
index 0000000000000..0be8e88de4f45
--- /dev/null
+++ b/paddle/phi/core/framework/var_type_helper.h
@@ -0,0 +1,264 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <iostream>
+#include <string>
+#include <typeindex>
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/common/float8_e4m3fn.h"
+#include "paddle/phi/common/float8_e5m2.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/framework/framework.pb.h"
+#include "paddle/utils/test_macros.h"
+
+namespace proto = paddle::framework::proto;
+
+namespace phi {
+
+TEST_API std::string DataTypeToString(const proto::VarType::Type type);
+TEST_API extern size_t SizeOfType(proto::VarType::Type type);
+
+template <typename T>
+struct IsComplex : public std::false_type {};
+
+template <typename T>
+struct IsComplex<phi::dtype::complex<T>> : public std::true_type {};
+
+template <typename T>
+struct DataTypeTrait {};
+
+// Stub handle for void
+template <>
+struct DataTypeTrait<void> {
+  constexpr static proto::VarType::Type DataType() {
+    return proto::VarType::RAW;
+  }
+};
+
+#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
+  callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);
+
+#define _ForEachDataType_(callback)                                           \
+  _ForEachDataTypeHelper_(callback, float, FP32);                             \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16);             \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::bfloat16, BF16);            \
+  _ForEachDataTypeHelper_(callback, double, FP64);                            \
+  _ForEachDataTypeHelper_(callback, int, INT32);                              \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                          \
+  _ForEachDataTypeHelper_(callback, bool, BOOL);                              \
+  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                          \
+  _ForEachDataTypeHelper_(callback, int16_t, INT16);                          \
+  _ForEachDataTypeHelper_(callback, int8_t, INT8);                            \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::complex<float>, COMPLEX64); \
+  _ForEachDataTypeHelper_(                                                    \
+      callback, ::phi::dtype::complex<double>, COMPLEX128);                   \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::float8_e4m3fn, FP8_E4M3FN); \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::float8_e5m2, FP8_E5M2);
+
+#define _ForEachIntDataType_(callback)               \
+  _ForEachDataTypeHelper_(callback, int, INT32);     \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64); \
+  _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \
+  _ForEachDataTypeHelper_(callback, int16_t, INT16); \
+  _ForEachDataTypeHelper_(callback, int8_t, INT8);
+
+#define _ForEachDataTypeSmall_(callback)                                      \
+  _ForEachDataTypeHelper_(callback, float, FP32);                             \
+  _ForEachDataTypeHelper_(callback, double, FP64);                            \
+  _ForEachDataTypeHelper_(callback, int, INT32);                              \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                          \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::complex<float>, COMPLEX64); \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::complex<double>, COMPLEX128);
+
+#define _ForEachDataTypeNormal_(callback)                         \
+  _ForEachDataTypeHelper_(callback, float, FP32);                 \
+  _ForEachDataTypeHelper_(callback, double, FP64);                \
+  _ForEachDataTypeHelper_(callback, int, INT32);                  \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);              \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16); \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::bfloat16, BF16);
+
+// For the use of thrust, as index-type elements can be only integers.
+#define _ForEachDataTypeTiny_(callback)          \
+  _ForEachDataTypeHelper_(callback, int, INT32); \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);
+
+// It's only for DataParallel in HIP, bf16 not support in HIP.
+#define _ForEachDataTypeForHIP_(callback)                                     \
+  _ForEachDataTypeHelper_(callback, float, FP32);                             \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16);             \
+  _ForEachDataTypeHelper_(callback, double, FP64);                            \
+  _ForEachDataTypeHelper_(callback, int, INT32);                              \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                          \
+  _ForEachDataTypeHelper_(callback, bool, BOOL);                              \
+  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                          \
+  _ForEachDataTypeHelper_(callback, int16_t, INT16);                          \
+  _ForEachDataTypeHelper_(callback, int8_t, INT8);                            \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::complex<float>, COMPLEX64); \
+  _ForEachDataTypeHelper_(callback, ::phi::dtype::complex<double>, COMPLEX128);
+
+#define DefineDataTypeTrait(cpp_type, proto_type)                           \
+  template <>                                                               \
+  struct DataTypeTrait<cpp_type> {                                          \
+    constexpr static proto::VarType::Type DataType() { return proto_type; } \
+  }
+
+_ForEachDataType_(DefineDataTypeTrait);
+
+#undef DefineDataTypeTrait
+
+TEST_API extern proto::VarType::Type ToDataType(std::type_index type);
+extern std::type_index ToTypeIndex(proto::VarType::Type type);
+
+template <typename Visitor>
+inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
+#define VisitDataTypeCallback(cpp_type, proto_type) \
+  do {                                              \
+    if (type == proto_type) {                       \
+      visitor.template apply<cpp_type>();           \
+      return;                                       \
+    }                                               \
+  } while (0)
+
+  _ForEachDataType_(VisitDataTypeCallback);
+#undef VisitDataTypeCallback
+  PADDLE_THROW(common::errors::Unimplemented(
+      "Not supported proto::VarType::Type(%d) as data type.",
+      static_cast<int>(type)));
+}
+
+template <typename Visitor>
+inline void VisitDataTypeSmall(proto::VarType::Type type, Visitor visitor) {
+#define VisitDataTypeCallbackSmall(cpp_type, proto_type) \
+  do {                                                   \
+    if (type == proto_type) {                            \
+      visitor.template apply<cpp_type>();                \
+      return;                                            \
+    }                                                    \
+  } while (0)
+
+  _ForEachDataTypeSmall_(VisitDataTypeCallbackSmall);
+#undef VisitDataTypeCallbackSmall
+}
+
+// for normal dtypes: int, int64, float, float64, float16, bfloat16
+template <typename Visitor>
+inline void VisitDataTypeNormal(proto::VarType::Type type, Visitor visitor) {
+#define VisitDataTypeCallbackNormal(cpp_type, proto_type) \
+  do {                                                    \
+    if (type == proto_type) {                             \
+      visitor.template apply<cpp_type>();                 \
+      return;                                             \
+    }                                                     \
+  } while (0)
+
+  _ForEachDataTypeNormal_(VisitDataTypeCallbackNormal);
+#undef VisitDataTypeCallbackNormal
+}
+
+template <typename Visitor>
+inline void VisitIntDataType(proto::VarType::Type type, Visitor visitor) {
+#define VisitIntDataTypeCallback(cpp_type, proto_type) \
+  do {                                                 \
+    if (type == proto_type) {                          \
+      visitor.template apply<cpp_type>();              \
+      return;                                          \
+    }                                                  \
+  } while (0)
+
+  _ForEachIntDataType_(VisitIntDataTypeCallback);
+
+  PADDLE_THROW(common::errors::Unimplemented(
+      "Expected integral data type, but got %s", DataTypeToString(type)));
+
+#undef VisitIntDataTypeCallback
+}
+
+template <typename Visitor>
+inline void VisitDataTypeTiny(proto::VarType::Type type, Visitor visitor) {
+#define VisitDataTypeCallbackTiny(cpp_type, proto_type) \
+  do {                                                  \
+    if (type == proto_type) {                           \
+      visitor.template apply<cpp_type>();               \
+      return;                                           \
+    }                                                   \
+  } while (0)
+
+  _ForEachDataTypeTiny_(VisitDataTypeCallbackTiny);
+#undef VisitDataTypeCallbackTiny
+}
+
+template <typename Visitor>
+inline void VisitDataTypeForHIP(proto::VarType::Type type, Visitor visitor) {
+#define VisitDataTypeCallbackHIP(cpp_type, proto_type) \
+  do {                                                 \
+    if (type == proto_type) {                          \
+      visitor.template apply<cpp_type>();              \
+      return;                                          \
+    }                                                  \
+  } while (0)
+
+  _ForEachDataTypeForHIP_(VisitDataTypeCallbackHIP);
+#undef VisitDataTypeCallbackHIP
+}
+
+inline std::ostream& operator<<(std::ostream& out,
+                                const proto::VarType::Type& type) {
+  out << DataTypeToString(type);
+  return out;
+}
+
+extern inline bool IsComplexType(const proto::VarType::Type& type) {
+  return (type == proto::VarType::COMPLEX64 ||
+          type == proto::VarType::COMPLEX128);
+}
+
+extern proto::VarType::Type PromoteTypesIfComplexExists(
+    const proto::VarType::Type type_a, const proto::VarType::Type type_b);
+
+extern inline proto::VarType::Type ToComplexType(proto::VarType::Type t) {
+  switch (t) {
+    case proto::VarType::FP32:
+      return proto::VarType::COMPLEX64;
+    case proto::VarType::FP64:
+      return proto::VarType::COMPLEX128;
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unknown real value data type (%s), now only support float32 and "
+          "float64.",
+          DataTypeToString(t)));
+  }
+}
+
+extern inline proto::VarType::Type ToRealType(proto::VarType::Type t) {
+  switch (t) {
+    case proto::VarType::COMPLEX64:
+      return proto::VarType::FP32;
+    case proto::VarType::COMPLEX128:
+      return proto::VarType::FP64;
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unknown complex value data type (%s), now only support complex64 "
+          "and complex128.",
+          DataTypeToString(t)));
+  }
+}
+
+}  // namespace phi
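
[Editor's note] The visitor helpers above dispatch on the runtime proto type and
instantiate the visitor's apply<T>() for the matching C++ type. A minimal sketch
(the ElementSizePrinter struct is hypothetical, not part of the patch):

    #include <iostream>

    struct ElementSizePrinter {
      template <typename CppT>
      void apply() const {
        std::cout << sizeof(CppT) << " bytes\n";  // 4 for FP32, 2 for FP16, ...
      }
    };

    // Somewhere with a runtime dtype in hand:
    phi::VisitDataType(proto::VarType::FP32, ElementSizePrinter());
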
diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc
index f18ed5690f0f3..50d5449b6f4d2 100644
--- a/paddle/phi/infermeta/nullary.cc
+++ b/paddle/phi/infermeta/nullary.cc
@@ -184,6 +184,8 @@ void PartialRecvInferMeta(int ring_id,
   out->set_dtype(dtype);
 }
 
+void LoadInferMeta(MetaTensor* out, MetaConfig config) {}
+
 void RandpermInferMeta(int n, DataType dtype, MetaTensor* out) {
   out->set_dims(common::make_ddim({n}));
   out->set_dtype(dtype);
diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h
index db2835aef23f7..3cbc1cff0390b 100644
--- a/paddle/phi/infermeta/nullary.h
+++ b/paddle/phi/infermeta/nullary.h
@@ -78,6 +78,8 @@ void GaussianInferMeta(const IntArray& shape,
                        DataType dtype,
                        MetaTensor* out);
 
+void LoadInferMeta(MetaTensor* out, MetaConfig config = MetaConfig());
+
 void RandpermInferMeta(int n,
                        DataType dtype,
                        MetaTensor* out);
 
 void RandintInferMeta(
diff --git a/paddle/phi/kernels/cpu/load_kernel.cc b/paddle/phi/kernels/cpu/load_kernel.cc
new file mode 100644
index 0000000000000..acf4c2a53d3f8
--- /dev/null
+++ b/paddle/phi/kernels/cpu/load_kernel.cc
@@ -0,0 +1,17 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/load_kernel_impl.h"
+
+PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, phi::LoadKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/load_kernel.cu b/paddle/phi/kernels/gpu/load_kernel.cu
new file mode 100644
index 0000000000000..7de6d1bdfe915
--- /dev/null
+++ b/paddle/phi/kernels/gpu/load_kernel.cu
@@ -0,0 +1,17 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/load_kernel_impl.h"
+
+PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, phi::LoadKernel, float) {}
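
[Editor's note] Both registrations above list only float; since PD_REGISTER_KERNEL
takes a variadic dtype list, widening the load kernel later would be a one-line
change per backend, e.g. (hypothetical, not in this patch):

    PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, phi::LoadKernel, float, double) {}
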
diff --git a/paddle/phi/kernels/impl/load_kernel_impl.h b/paddle/phi/kernels/impl/load_kernel_impl.h
new file mode 100644
index 0000000000000..076185b442c26
--- /dev/null
+++ b/paddle/phi/kernels/impl/load_kernel_impl.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <fstream>
+
+#include "paddle/phi/core/framework/lod_tensor_serialize.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LoadKernel(const Context& dev_ctx,
+                const std::string& file_path,
+                int64_t seek,
+                const std::vector<int64_t>& shape,
+                bool load_as_fp16,
+                phi::DenseTensor* out) {
+  // FIXME(yuyang18): We save variable to local file now, but we should change
+  // it to save an output stream.
+  std::ifstream fin(file_path, std::ios::binary);
+  PADDLE_ENFORCE_EQ(
+      static_cast<bool>(fin),
+      true,
+      errors::Unavailable("Load operator fails to open file %s, please check "
+                          "whether the model file is complete or damaged.",
+                          file_path));
+  PADDLE_ENFORCE_NOT_NULL(
+      out,
+      errors::InvalidArgument("The variable to be loaded cannot be found."));
+
+  if (seek != -1) {
+    PADDLE_ENFORCE_GE(
+        seek,
+        0,
+        errors::InvalidArgument(
+            "seek with tensor must be greater than or equal to 0"));
+    phi::DeserializeFromStream(fin, out, dev_ctx, seek, shape);
+  } else {
+    phi::DeserializeFromStream(fin, out, dev_ctx);
+  }
+
+  auto in_dtype = out->dtype();
+  auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
+  if (in_dtype != out_dtype) {
+    phi::CastKernel<T>(dev_ctx, *out, out_dtype, out);
+  }
+}
+
+}  // namespace phi
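
[Editor's note] For orientation, a direct call to the implementation would look
roughly as follows; the path is a placeholder and real usage goes through the
registered `load` op rather than this function (a sketch, assuming an initialized
context):

    phi::CPUContext ctx;  // assumed set up elsewhere
    phi::DenseTensor out;
    // seek == -1 loads the whole serialized tensor; seek >= 0 reads `shape`
    // elements starting at that offset via the seek-aware overload.
    phi::LoadKernel<float, phi::CPUContext>(
        ctx, "/tmp/param.bin", /*seek=*/-1, /*shape=*/{},
        /*load_as_fp16=*/false, &out);
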
diff --git a/paddle/phi/kernels/selected_rows/cpu/load_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/load_kernel.cc
new file mode 100644
index 0000000000000..618ee9bae7da6
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/cpu/load_kernel.cc
@@ -0,0 +1,18 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/selected_rows/impl/load_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    load_sr, CPU, ALL_LAYOUT, phi::sr::LoadSelectedRowsKernel, float) {}
diff --git a/paddle/phi/kernels/selected_rows/gpu/load_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/load_kernel.cu
new file mode 100644
index 0000000000000..73e5a20ce4c18
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/gpu/load_kernel.cu
@@ -0,0 +1,18 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/selected_rows/impl/load_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    load_sr, GPU, ALL_LAYOUT, phi::sr::LoadSelectedRowsKernel, float) {}
diff --git a/paddle/phi/kernels/selected_rows/impl/load_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/load_kernel_impl.h
new file mode 100644
index 0000000000000..15cf50889594c
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/impl/load_kernel_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <fstream>
+
+#include "paddle/phi/core/framework/selected_rows_serialize.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+
+namespace phi::sr {
+
+template <typename T, typename Context>
+void LoadSelectedRowsKernel(const Context& dev_ctx,
+                            const std::string& file_path,
+                            int64_t seek,
+                            const std::vector<int64_t>& shape,
+                            bool load_as_fp16,
+                            phi::SelectedRows* out) {
+  // FIXME(yuyang18): We save variable to local file now, but we should change
+  // it to save an output stream.
+  std::ifstream fin(file_path, std::ios::binary);
+  PADDLE_ENFORCE_EQ(
+      static_cast<bool>(fin),
+      true,
+      errors::Unavailable("Load operator fails to open file %s, please check "
+                          "whether the model file is complete or damaged.",
+                          file_path));
+  PADDLE_ENFORCE_NOT_NULL(
+      out,
+      errors::InvalidArgument("The variable to be loaded cannot be found."));
+
+  phi::DeserializeFromStream(fin, out, dev_ctx);
+}
+}  // namespace phi::sr
diff --git a/paddle/phi/kernels/xpu/load_kernel.cc b/paddle/phi/kernels/xpu/load_kernel.cc
new file mode 100644
index 0000000000000..efcb0cf7f13e8
--- /dev/null
+++ b/paddle/phi/kernels/xpu/load_kernel.cc
@@ -0,0 +1,17 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/load_kernel_impl.h"
+
+PD_REGISTER_KERNEL(load, XPU, ALL_LAYOUT, phi::LoadKernel, float) {}

From 2b1bd49408f0042a7d526a4487c4b66a1cfe2a52 Mon Sep 17 00:00:00 2001
From: umiswing
Date: Tue, 15 Oct 2024 14:13:12 +0800
Subject: [PATCH 133/135] Support dense mask with causal in flash attention
 (#68696)

* update flashattn submodule

* support dense mask with causal.

* update submodule

* refine
---
 paddle/phi/kernels/gpu/flash_attn_utils.h | 5 -----
 third_party/flashattn                     | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/paddle/phi/kernels/gpu/flash_attn_utils.h b/paddle/phi/kernels/gpu/flash_attn_utils.h
index b36ae680cc582..cebccd8e1eaac 100644
--- a/paddle/phi/kernels/gpu/flash_attn_utils.h
+++ b/paddle/phi/kernels/gpu/flash_attn_utils.h
@@ -179,11 +179,6 @@ struct FlashAttnParamsBase {
     softmax_lse_dims = {batch_size, num_heads, seqlen_q_rounded};
 
     if (attn_mask_tensor) {
-      PADDLE_ENFORCE_NE(causal,
-                        true,
-                        common::errors::InvalidArgument(
-                            "When attn_mask is set, causal can not be true."));
-
       PADDLE_ENFORCE_EQ(
           attn_mask->dtype(),
           q_dtype,
diff --git a/third_party/flashattn b/third_party/flashattn
index 9741fce0ee752..d8915628a941d 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit 9741fce0ee752a6fa65acd98f3adec23e636e0c7
+Subproject commit d8915628a941d946c0f962e628e28de5469ae690

From d47724828d747aa5cc34560d4afa8c42f45ac7f9 Mon Sep 17 00:00:00 2001
From: BiynXu <62832681+BiynXu@users.noreply.github.com>
Date: Tue, 15 Oct 2024 14:14:40 +0800
Subject: [PATCH 134/135] [CINN] Remove warp reduce on one-thread reduction
 (#68587)

---
 .../group_schedule/tactic/tile_first_general_tactic.cc | 10 +++++++---
 paddle/cinn/optim/replace_cross_thread_reduction.cc    |  7 +++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
index 6e12bb01af12f..ff4f544cc57ee 100644
--- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
+++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
@@ -115,6 +115,8 @@ void TileFirstGeneralTactic::Init(ScheduleContext* context) {
       }
     }
   }
+
+  map_rf_block_.clear();
 }
 
 void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch,
@@ -215,8 +217,9 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile(
   loops = sch->GetLoops(block_id);
   sch->Reorder({loops[current_reduce_axis + 1], loops[current_reduce_axis]});
 
-  if (IsReductionSBlock(sch->GetBlock(block_id))) {
-    loops = sch->GetLoops(block_id);
+  loops = sch->GetLoops(block_id);
+  if (IsReductionSBlock(sch->GetBlock(block_id)) &&
+      ir::GetLoopExtent(loops[current_reduce_axis]) != 1) {
     ir::Expr rf_tensor =
         sch->FactorizeReduction(loops[current_reduce_axis],
                                 /* rf_axis = */ 0,
@@ -383,7 +386,8 @@ void TileFirstGeneralTactic::SplitReduceInner(ir::IRSchedule* sch,
   sch->Split(loops[2], std::vector<int>{16, -1});
 
   loops = sch->GetLoops(block_id);
-  if (IsReductionSBlock(sch->GetBlock(block_id))) {
+  if (IsReductionSBlock(sch->GetBlock(block_id)) &&
+      ir::GetLoopExtent(loops[2]) != 1) {
     ir::Expr rf_tensor =
         sch->FactorizeReduction(loops[2],
                                 0,
diff --git a/paddle/cinn/optim/replace_cross_thread_reduction.cc b/paddle/cinn/optim/replace_cross_thread_reduction.cc
index a449f67d2ad02..f5706b1831aac 100644
--- a/paddle/cinn/optim/replace_cross_thread_reduction.cc
+++ b/paddle/cinn/optim/replace_cross_thread_reduction.cc
@@ -85,8 +85,11 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> {
     std::vector<int> thread_binded_reduce_loop_indices;
     bool is_thread_binded_inner_loop = false;
     for (int i = 0; i < cur_loops_.size(); ++i) {
-      if (is_thread_binded_inner_loop ||
-          IsThreadBindOnReduceAxis(cur_loops_[i].As<ir::For>())) {
+      bool is_thread_bind_on_reduce =
+          IsThreadBindOnReduceAxis(cur_loops_[i].As<ir::For>());
+      if (is_thread_bind_on_reduce && ir::GetLoopExtent(cur_loops_[i]) == 1)
+        return false;
+      if (is_thread_binded_inner_loop || is_thread_bind_on_reduce) {
         if (ir::GetLoopExtent(cur_loops_[i]) > 1024) {
           return false;
         }

From 159aa581d5263abdf371713b2df53d6061c96759 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Tue, 15 Oct 2024 14:16:19 +0800
Subject: [PATCH 135/135] [Inference] Add dropout and depthwise_conv2d (#68593)

* add dropout and depthwise_conv2d

* add unittest

* perfect utils

* re trigger

* re trigger

* re trigger
---
 python/paddle/tensorrt/converter.py       |  5 +--
 python/paddle/tensorrt/converter_utils.py |  5 ++-
 python/paddle/tensorrt/impls/common.py    | 36 +++++++++++++++++-
 python/paddle/tensorrt/impls/conv.py      |  1 +
 python/paddle/tensorrt/util.py            |  5 +++
 test/tensorrt/test_converter_common.py    | 45 +++++++++++++++++++++++
 test/tensorrt/test_converter_conv.py      | 17 +++++++++
 7 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py
index 19f3aebb07116..eec4c3d553326 100644
--- a/python/paddle/tensorrt/converter.py
+++ b/python/paddle/tensorrt/converter.py
@@ -44,10 +44,9 @@ from .impls.search import *  # noqa: F403
 from .impls.stat import *  # noqa: F403
 from .register import converter_registry
-from .util import map_dtype
+from .util import get_trt_version_list, map_dtype
 
-version = trt.__version__
-version_list = list(map(int, version.split('.')))
+version_list = get_trt_version_list()
 
 
 def get_cache_path():
diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py
index 337d876b3df2a..21f11cbc24bb1 100644
--- a/python/paddle/tensorrt/converter_utils.py
+++ b/python/paddle/tensorrt/converter_utils.py
@@ -19,6 +19,8 @@
 import numpy as np
 import tensorrt as trt
 
+from .util import get_trt_version_list
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
 if parent_dir not in sys.path:
@@ -31,8 +33,7 @@
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
 )
 
-version = trt.__version__
-version_list = list(map(int, version.split('.')))
+version_list = get_trt_version_list()
 
 
 def has_dynamic_shape(shape):
diff --git a/python/paddle/tensorrt/impls/common.py b/python/paddle/tensorrt/impls/common.py
index 5fd08d0c0ecfa..ce6ac2ce9130a 100644
--- a/python/paddle/tensorrt/impls/common.py
+++ b/python/paddle/tensorrt/impls/common.py
@@ -12,10 +12,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import numpy as np
 import tensorrt as trt
 
 from paddle.tensorrt.converter_utils import get_shape_tensor_element
 from paddle.tensorrt.register import converter_registry
+from paddle.tensorrt.util import get_trt_version_list
+
+
+@converter_registry.register("pd_op.dropout", trt_version="8.x")
+def dropout_converter(network, paddle_op, inputs):
+    input_x = inputs[0]
+    p_defining_op = paddle_op.operands()[2].source().get_defining_op()
+    dropout_prob = p_defining_op.attrs()["value"]
+    downgrade_in_infer = paddle_op.attrs().get("mode")
+
+    if downgrade_in_infer == "upscale_in_train":
+        # In upscale_in_train mode, inference is an identity mapping, so a
+        # shuffle (copy) layer is enough.
+        shuffle_layer = network.add_shuffle(input_x)
+        return shuffle_layer.get_output(0)
+
+    # In downgrade_in_infer mode, inference scales the input by (1 - p).
+    weight_data = np.array([1 - dropout_prob]).astype("float32")
+    scale_weights = trt.Weights(weight_data)
+    shift_weights = trt.Weights(np.array([0]).astype("float32"))
+    power_weights = trt.Weights(np.array([1]).astype("float32"))
+
+    scale_layer = network.add_scale(
+        input_x,
+        mode=trt.ScaleMode.UNIFORM,
+        shift=shift_weights,
+        scale=scale_weights,
+        power=power_weights,
+    )
+
+    return scale_layer.get_output(0)
 
 
 @converter_registry.register("pd_op.bilinear_interp", trt_version="8.x")
@@ -31,7 +61,8 @@ def bilinear_interp_converter(network, paddle_op, inputs):
     out_d = paddle_op.attrs().get("out_d")
     scale_attr = paddle_op.attrs().get("scale")
 
-    trt_major, trt_minor, trt_patch = trt.__version__.split(".")
+    trt_major = get_trt_version_list()[0]
+    trt_minor = get_trt_version_list()[1]
     trt_version_float = float(f"{trt_major}.{trt_minor}")
 
     resize_layer = network.add_resize(input_tensor)
@@ -145,7 +176,8 @@ def nearest_interp_converter(network, paddle_op, inputs):
     scale_attr = paddle_op.attrs().get("scale")
 
     # Parse TensorRT version
-    trt_major, trt_minor, trt_patch = trt.__version__.split(".")
+    trt_major = get_trt_version_list()[0]
+    trt_minor = get_trt_version_list()[1]
     trt_version_float = float(f"{trt_major}.{trt_minor}")
 
     # Create Resize layer
     resize_layer = network.add_resize(input_tensor)
diff --git a/python/paddle/tensorrt/impls/conv.py b/python/paddle/tensorrt/impls/conv.py
index bce8ef36b1d88..71174ec87a3e1 100644
--- a/python/paddle/tensorrt/impls/conv.py
+++ b/python/paddle/tensorrt/impls/conv.py
@@ -18,6 +18,7 @@
 from paddle.tensorrt.register import converter_registry
 
 
+@converter_registry.register("pd_op.depthwise_conv2d", trt_version="8.x")
 @converter_registry.register("pd_op.conv2d", trt_version="8.x")
 def conv2d_converter(network, paddle_op, inputs):
     input_tensor, weight = inputs
diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py
index 8f50744fb9adc..717a7c14ac4a6 100644
--- a/python/paddle/tensorrt/util.py
+++ b/python/paddle/tensorrt/util.py
@@ -112,6 +112,11 @@ def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None):
     return exe_program
 
 
+def get_trt_version_list():
+    version = trt.__version__
+    return list(map(int, version.split('.')))
+
+
 # Adding marker labels to builtin ops facilitates convert processing, but they
 # ultimately do not enter the TensorRT subgraph.
 def mark_buitlin_op(program):
     for op in program.global_block().ops:
diff --git a/test/tensorrt/test_converter_common.py b/test/tensorrt/test_converter_common.py
index da886bb623c3d..1c320374b3f52 100644
--- a/test/tensorrt/test_converter_common.py
+++ b/test/tensorrt/test_converter_common.py
@@ -21,6 +21,51 @@
 from paddle import _C_ops
 
 
+def dropout_wrapper(x, p, mode):
+    out = _C_ops.dropout(
+        x,
+        None,
+        p,
+        True,
+        mode,
+        0,
+        True,
+    )
+    return out
+
+
+class TestDropoutWithUpscaleModeTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = dropout_wrapper
+        self.api_args = {
+            "x": np.random.random([1, 2, 3]).astype("float32"),
+            "p": 0,
+            "mode": "upscale_in_train",
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 2, 3]}
+        self.max_shape = {"x": [10, 2, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
+class TestDropoutWithDowngradeModeTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = dropout_wrapper
+        self.api_args = {
+            "x": np.random.random([1, 2, 3]).astype("float32"),
+            "p": 0,
+            "mode": "downgrade_in_infer",
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 2, 3]}
+        self.max_shape = {"x": [10, 2, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
 def upsample_bilinear(x):
     upsample = paddle.nn.Upsample(size=[12, 12], mode="bilinear")
     return upsample(x)
diff --git a/test/tensorrt/test_converter_conv.py b/test/tensorrt/test_converter_conv.py
index d5be5dd56cf23..0607930f67967 100644
--- a/test/tensorrt/test_converter_conv.py
+++ b/test/tensorrt/test_converter_conv.py
@@ -37,5 +37,22 @@ def test_trt_result(self):
         self.check_trt_result()
 
 
+def depthwise_conv2d_wrapper(x):
+    conv = paddle.nn.Conv2D(2, 2, (3, 3), groups=2)
+    return conv(x)
+
+
+class TestDepthwiseConv2dTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = depthwise_conv2d_wrapper
+        self.api_args = {"x": np.random.random([3, 2, 8, 8]).astype("float32")}
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 2, 8, 8]}
+        self.max_shape = {"x": [10, 2, 8, 8]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
 if __name__ == '__main__':
     unittest.main()