Skip to content

Commit

Permalink
Merge branch 'TensorClip' of https://github.com/a162837/Paddle into TensorClip
Browse files Browse the repository at this point in the history
  • Loading branch information
a162837 committed Nov 13, 2024
2 parents 8b31397 + 6a806f2 commit cd2738f
Show file tree
Hide file tree
Showing 285 changed files with 2,953 additions and 901 deletions.
92 changes: 61 additions & 31 deletions cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(XPU_PROJECT "extern_xpu")
set(XPU_API_LIB_NAME "libxpuapi.so")
set(XPU_RT_LIB_NAME "libxpurt.so")
set(XPU_CUDA_LIB_NAME "libxpucuda.so")
set(XPU_CUDA_RT_LIB_NAME "libcudart.so")
set(XPU_XFT_LIB_NAME "libxft.so")
set(XPU_XPTI_LIB_NAME "libxpti.so")
set(XPU_XBLAS_LIB_NAME "libxpu_blas.so")
Expand All @@ -31,7 +32,7 @@ if(NOT DEFINED XPU_XRE_BASE_VERSION)
set(XPU_XRE_BASE_VERSION "4.32.0.1")
endif()
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "eb35/20241104")
set(XPU_XHPC_BASE_DATE "dev/20241113")
endif()
set(XPU_XCCL_BASE_VERSION "1.2.11e")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
Expand Down Expand Up @@ -139,6 +140,7 @@ set(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
set(XPU_XBLAS_LIB "${XPU_LIB_DIR}/${XPU_XBLAS_LIB_NAME}")
set(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
set(XPU_CUDA_LIB "${XPU_LIB_DIR}/${XPU_CUDA_LIB_NAME}")
set(XPU_CUDA_RT_LIB "${XPU_LIB_DIR}/${XPU_CUDA_RT_LIB_NAME}")
set(XPU_XFA_LIB "${XPU_LIB_DIR}/${XPU_XFA_LIB_NAME}")
set(XPU_XPUDNN_LIB "${XPU_LIB_DIR}/${XPU_XPUDNN_LIB_NAME}")

Expand All @@ -160,29 +162,51 @@ if(WITH_XPU_BKCL)
include_directories(${XPU_BKCL_INC_DIR})
endif()

ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${SNAPPY_PREFIX_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND
bash ${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_dependence.sh ${XPU_XRE_URL}
${XPU_XRE_DIR_NAME} ${XPU_XHPC_URL} ${XPU_XHPC_DIR_NAME} ${XPU_XCCL_URL}
${XPU_XCCL_DIR_NAME} ${XPU_XHPC_URL} ${XPU_XHPC_DIR_NAME} && wget
${XPU_XFT_GET_DEPENCE_URL} && bash get_xft_dependence.sh ${XPU_XFT_URL}
${XPU_XFT_DIR_NAME} && bash
${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL}
${XPU_XPTI_DIR_NAME}
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
BUILD_BYPRODUCTS ${XPU_API_LIB}
BUILD_BYPRODUCTS ${XPU_XBLAS_LIB}
BUILD_BYPRODUCTS ${XPU_XPUDNN_LIB}
BUILD_BYPRODUCTS ${XPU_XFA_LIB}
BUILD_BYPRODUCTS ${XPU_RT_LIB}
BUILD_BYPRODUCTS ${XPU_BKCL_LIB})
# The XRE5 and legacy download flows are identical except for the final
# positional flag passed to pack_paddle_dependence.sh (1 = XRE5 layout,
# 0 = legacy) and the extra libraries XRE5 ships. Compute both up front so
# a single ExternalProject_Add covers both configurations.
if(WITH_XPU_XRE5)
  set(XPU_PACK_XRE5_FLAG 1)
  # XRE5-only artifacts; declared as byproducts so Ninja can track/clean them.
  set(XPU_EXTRA_BYPRODUCTS ${XPU_XBLAS_LIB} ${XPU_XPUDNN_LIB} ${XPU_XFA_LIB}
                           ${XPU_CUDA_RT_LIB})
else()
  set(XPU_PACK_XRE5_FLAG 0)
  set(XPU_EXTRA_BYPRODUCTS "")
endif()

ExternalProject_Add(
  ${XPU_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  # NOTE(review): reuses SNAPPY's prefix dir — looks like a copy-paste from
  # the snappy recipe; confirm whether a dedicated XPU prefix was intended.
  PREFIX ${SNAPPY_PREFIX_DIR}
  DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
  DOWNLOAD_COMMAND
    bash ${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_dependence.sh
    ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XHPC_URL} ${XPU_XHPC_DIR_NAME}
    ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} ${XPU_PACK_XRE5_FLAG} && wget
    ${XPU_XFT_GET_DEPENCE_URL} && bash get_xft_dependence.sh ${XPU_XFT_URL}
    ${XPU_XFT_DIR_NAME} && bash
    ${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL}
    ${XPU_XPTI_DIR_NAME}
  DOWNLOAD_NO_PROGRESS 1
  UPDATE_COMMAND ""
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
  BUILD_BYPRODUCTS ${XPU_API_LIB}
  BUILD_BYPRODUCTS ${XPU_RT_LIB}
  BUILD_BYPRODUCTS ${XPU_BKCL_LIB}
  BUILD_BYPRODUCTS ${XPU_EXTRA_BYPRODUCTS})

include_directories(${XPU_INC_DIR})
add_library(shared_xpuapi SHARED IMPORTED GLOBAL)
Expand All @@ -192,7 +216,11 @@ set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")

# Core runtime deps for the dummy xpulib target; XRE5 additionally needs the
# bundled CUDA runtime shim. (Keyword-less signature kept on purpose: this
# file links xpulib with the plain form elsewhere, and CMake forbids mixing
# plain and PRIVATE/PUBLIC signatures on one target.)
set(xpulib_link_libs ${XPU_API_LIB} ${XPU_RT_LIB})
if(WITH_XPU_XRE5)
  list(APPEND xpulib_link_libs ${XPU_CUDA_RT_LIB})
endif()
target_link_libraries(xpulib ${xpulib_link_libs})

if(WITH_XPU_XFT)
message(STATUS "Compile with XPU XFT!")
Expand All @@ -205,15 +233,17 @@ endif()

set(XPU_XHPC_INC_DIR "${XPU_INC_DIR}/xhpc")
include_directories(${XPU_XHPC_INC_DIR})
set(XPU_XBLAS_INC_DIR "${XPU_INC_DIR}/xhpc/xblas")
include_directories(${XPU_XBLAS_INC_DIR})
set(XPU_XPUDNN_INC_DIR "${XPU_INC_DIR}/xhpc/xpudnn")
include_directories(${XPU_XPUDNN_INC_DIR})
set(XPU_XRE_INC_DIR "${XPU_INC_DIR}/xre")
include_directories(${XPU_XRE_INC_DIR})

# XRE5-only headers: xblas, xfa and xpudnn ship inside the xhpc bundle.
# The *_INC_DIR variables are kept (not inlined) in case other parts of the
# build read them.
if(WITH_XPU_XRE5)
  add_definitions(-DPADDLE_WITH_XPU_XRE5)
  set(XPU_XBLAS_INC_DIR "${XPU_INC_DIR}/xhpc/xblas")
  set(XPU_XFA_INC_DIR "${XPU_INC_DIR}/xhpc/xfa")
  set(XPU_XPUDNN_INC_DIR "${XPU_INC_DIR}/xhpc/xpudnn")
  include_directories(${XPU_XBLAS_INC_DIR} ${XPU_XFA_INC_DIR}
                      ${XPU_XPUDNN_INC_DIR})
endif()

if(WITH_XPTI)
Expand All @@ -236,14 +266,14 @@ if(WITH_XPU_XRE5)
target_link_libraries(
xpulib
${XPU_RT_LIB}
${XPU_CUDA_RT_LIB}
${XPU_BKCL_LIB}
${XPU_XBLAS_LIB}
${XPU_API_LIB}
${XPU_XFA_LIB}
${XPU_XPUDNN_LIB})
else()
target_link_libraries(xpulib ${XPU_RT_LIB} ${XPU_BKCL_LIB} ${XPU_XBLAS_LIB}
${XPU_API_LIB})
target_link_libraries(xpulib ${XPU_RT_LIB} ${XPU_BKCL_LIB} ${XPU_API_LIB})
endif()

add_dependencies(xpulib ${XPU_PROJECT})
Expand Down
28 changes: 28 additions & 0 deletions paddle/cinn/hlir/framework/pir/trivial_op_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,29 @@ void DebugPrintReduceVar(const FusibleOp& op) {
}
}

// For a top-level Max/Min/Add node, strip a constant operand and return the
// non-constant one (e.g. cinn_max(-inf, var[i]) -> var[i]). If neither (or
// both) operands are constant, or the node is any other kind, the expression
// is returned unchanged. Only the outermost node is inspected; this does not
// recurse.
ir::Expr GetBaseVariableExpr(const ir::Expr& expr) {
  const auto strip_constant_operand = [](const ir::Expr& whole,
                                         const ir::Expr& lhs,
                                         const ir::Expr& rhs) -> ir::Expr {
    if (lhs.is_constant()) return rhs;
    if (rhs.is_constant()) return lhs;
    // Neither side is constant: nothing to strip.
    return whole;
  };

  if (auto* max_node = expr.As<ir::Max>()) {
    return strip_constant_operand(expr, max_node->a(), max_node->b());
  }
  if (auto* min_node = expr.As<ir::Min>()) {
    return strip_constant_operand(expr, min_node->a(), min_node->b());
  }
  if (auto* add_node = expr.As<ir::Add>()) {
    return strip_constant_operand(expr, add_node->a(), add_node->b());
  }
  return expr;
}

std::pair<TrivialOp, ReduceOp> SplitReduceOp(const ReduceOp& reduce_op) {
VLOG(4) << "Start SplitReduceOp";
VLOG(4) << "DebugPrint Op Origin: " << _GetRootExpr(reduce_op);
Expand All @@ -502,6 +525,11 @@ std::pair<TrivialOp, ReduceOp> SplitReduceOp(const ReduceOp& reduce_op) {

const std::vector<ir::Var>& all_iters = ComposeUtils::ConcatVector(
GetOutputIters(reduce_op), GetReduceIters(reduce_op));

// TODO(phlrain): trivial_compute_body contain cinn_max(-inf, var[i])
// only need keep var[i].
// Need to remove split transform
trivial_compute_body = GetBaseVariableExpr(trivial_compute_body);
VLOG(4) << "Trivial Compute Body is " << trivial_compute_body;
ir::Tensor new_trivial_tensor =
ir::Tensor(reduce_out_tensor->name + "_split_transform",
Expand Down
28 changes: 21 additions & 7 deletions paddle/cinn/hlir/op/elementwise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -326,19 +326,33 @@ std::shared_ptr<OpStrategy> StrategyForScaleSymbolic(
out = Compute(
A->shape,
[=](const std::vector<Expr> &indice) {
Expr cast_A_indice =
should_upscale_fp32
? ir::Cast::Make(cinn::common::F32(), A(indice))
: A(indice);

Expr cast_scale = should_upscale_fp32
? Expr(scale)
: ir::Cast::Make(A->type(), Expr(scale));
Expr cast_bias = should_upscale_fp32
? Expr(bias)
: ir::Cast::Make(A->type(), Expr(bias));
Expr cast_A_indice =
should_upscale_fp32
? ir::Cast::Make(cinn::common::F32(), A(indice))
: A(indice);
Expr add_result = bias_after_scale
? cast_scale * cast_A_indice + cast_bias
: cast_scale * (cast_A_indice + cast_bias);
Expr add_result;
if (scale == 1.0f) {
if (bias == 0.0f) {
add_result = cast_A_indice;
} else {
add_result = cast_A_indice + cast_bias;
}
} else {
if (bias == 0.0f) {
add_result = cast_scale * cast_A_indice;
} else {
add_result = bias_after_scale
? cast_scale * cast_A_indice + cast_bias
: cast_scale * (cast_A_indice + cast_bias);
}
}
return should_upscale_fp32 ? ir::Cast::Make(A->type(), add_result)
: add_result;
},
Expand Down
11 changes: 11 additions & 0 deletions paddle/cinn/ir/ir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -926,9 +926,20 @@ Expr Load::Make(Expr tensor, const std::vector<Expr> &origin_indices) {

void Load::convert_int32_to_int64() {
IrNode::convert_int32_to_int64();
for (auto &indice : indices) {
indice->convert_int32_to_int64();
}
tensor->convert_int32_to_int64();
}

void Load::convert_int64_to_int32() {
IrNode::convert_int64_to_int32();
for (auto &indice : indices) {
indice->convert_int64_to_int32();
}
tensor->convert_int64_to_int32();
}

Type Load::type() const {
PADDLE_ENFORCE_EQ(
tensor.defined(),
Expand Down
2 changes: 2 additions & 0 deletions paddle/cinn/ir/ir.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,8 @@ struct Load : public ExprNode<Load>, public LoadStoreAddrMnger {

void convert_int32_to_int64() override;

void convert_int64_to_int32() override;

static const IrNodeTy _node_type_ = IrNodeTy::Load;
};

Expand Down
26 changes: 25 additions & 1 deletion paddle/cinn/ir/ir_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ bool Expr::is_index() const {
case ir::IrNodeTy::Cast:
[[fallthrough]];
case ir::IrNodeTy::_Var_:
return true;
[[fallthrough]];
case ir::IrNodeTy::IntImm: {
if (type().is_index_type()) return true;
}
Expand Down Expand Up @@ -346,6 +346,30 @@ void IrNode::convert_int32_to_int64() {
}
}

void IrNode::convert_int64_to_int32() {
  // Narrow this node and (recursively) its operands from int64/uint64 to
  // int32/uint32. Nodes that are already 32-bit pass through; any other
  // concrete type is rejected. Equivalent to the previous nested brace-less
  // ifs, flattened for readability.
  if (type_ != Int(64) && type_ != UInt(64) && type_ != Int(32) &&
      type_ != UInt(32)) {
    PADDLE_ENFORCE_EQ(type_.is_unk(),
                      true,
                      ::common::errors::InvalidArgument(
                          "Current only support convert int64_t "
                          "to int32_t, but get type is: %s",
                          type_));
  }

  if (node_type() == IrNodeTy::IntImm) {
    auto *int_imm = static_cast<IntImm *>(this);
    // Leave constants that do not fit in a 32-bit int untouched: narrowing
    // would silently change their value. Fixes the previous `>= INT_MAX`
    // check, which wrongly bailed on INT_MAX itself (representable) and let
    // negative out-of-range values be truncated.
    if (int_imm->value > INT_MAX || int_imm->value < INT_MIN) return;
    int_imm->value = int32_t(int_imm->value);
  }

  if (type_ == Int(64)) type_ = Int(32);
  if (type_ == UInt(64)) type_ = UInt(32);

  // Propagate the narrowing through every operand expression.
  for (Expr &operand : operands) {
    operand->convert_int64_to_int32();
  }
}

void TryElevateInt32ToInt64(const std::vector<Expr> &expr_vec) {
Type type = expr_vec.front()->type();
for (const Expr &expr : expr_vec) {
Expand Down
3 changes: 3 additions & 0 deletions paddle/cinn/ir/ir_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ class IrNode : public cinn::common::Object {
//! Elevate int32 to int64 if needed
virtual void convert_int32_to_int64();

//! Elevate int64 to int32 if needed
virtual void convert_int64_to_int32();

virtual void replace(Expr old_op, Expr new_op);
//! Get i-th operand
const Expr& operand(int i);
Expand Down
3 changes: 2 additions & 1 deletion paddle/cinn/optim/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ gather_srcs(
merge_block_utils.cc
eliminate_common_global_memory_read.cc
rearrange_load_instruction.cc
check_tensor_buffer_map.cc)
check_tensor_buffer_map.cc
longlong2int.cc)

if(WITH_CUDA OR WITH_ROCM)
gather_srcs(cinnapi_src SRCS transform_gpu_forloop.cc)
Expand Down
Loading

0 comments on commit cd2738f

Please sign in to comment.