FusedMultiTransformer optimization (#59385)
* merge fuse_mt

* fix build error

* merge fuse_mt

* fix build ci error

* fix build ci error

* fix build ci error

* fix build ci error

* fix ci build cutlass version

* skip ci unittest
RichardWooSJTU authored Dec 8, 2023
1 parent f27f769 commit 6279a67
Showing 81 changed files with 26,126 additions and 2,945 deletions.
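The files below thread BFLOAT16 through the inference API end to end: the public DataType/PaddleDType enums, feed/fetch conversion in the predictor, the zero-copy Tensor template instantiations, and the contrib tensor-copy utilities. As a hedged orientation aid (not part of the diff), here is a minimal sketch of how a caller might feed and fetch bfloat16 data once this lands; the model paths, shapes, and GPU config are placeholder assumptions, and only the bfloat16-related calls come from the instantiations added below.

```cpp
// Minimal usage sketch: bfloat16 feed/fetch through the paddle_infer C++ API.
// Paths, shapes and Config options are illustrative placeholders.
#include <vector>

#include "paddle/phi/common/bfloat16.h"  // phi::dtype::bfloat16
#include "paddle_inference_api.h"

using bfloat16 = phi::dtype::bfloat16;

int main() {
  paddle_infer::Config config("./model.pdmodel", "./model.pdiparams");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/256, /*device_id=*/0);
  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed: bfloat16 is now a valid element type for zero-copy input tensors.
  auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  std::vector<bfloat16> host_in(1 * 128, bfloat16(1.0f));
  input->Reshape({1, 128});
  input->CopyFromCpu(host_in.data());  // CopyFromCpu<bfloat16> added in this PR

  predictor->Run();

  // Fetch: outputs reported as DataType::BFLOAT16 copy back element for element.
  auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  int numel = 1;
  for (int d : output->shape()) numel *= d;
  std::vector<bfloat16> host_out(numel);
  output->CopyToCpu(host_out.data());  // CopyToCpu<bfloat16> added in this PR
  return 0;
}
```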
3 changes: 3 additions & 0 deletions .gitignore
@@ -96,3 +96,6 @@ paddle/phi/api/profiler/__init__.py
python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
paddle/phi/kernels/fusion/cutlass/conv2d/generated/*
python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py

# these files are auto-generated by memory_efficient_fmha_variable
autogen*
9 changes: 9 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -169,6 +169,8 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
input_ptr = t->mutable_data<int32_t>(ddim, place);
} else if (pt.dtype == PaddleDType::FLOAT16) {
input_ptr = t->mutable_data<float16>(ddim, place);
} else if (pt.dtype == PaddleDType::BFLOAT16) {
input_ptr = t->mutable_data<bfloat16>(ddim, place);
} else {
LOG(ERROR) << "unsupported feed type " << pt.dtype;
return false;
@@ -1226,6 +1228,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::proto::VarType::FP16) {
GetFetchOne<float16>(fetch, output);
output->dtype = PaddleDType::FLOAT16;
} else if (type == framework::proto::VarType::BF16) {
GetFetchOne<bfloat16>(fetch, output);
output->dtype = PaddleDType::BFLOAT16;
} else {
LOG(ERROR) << "unknown type, only support float32, float16, int64 and "
"int32 now.";
@@ -1766,6 +1771,8 @@ AnalysisPredictor::GetInputTypes() {
input_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
input_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
input_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
input_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
@@ -1819,6 +1826,8 @@ AnalysisPredictor::GetOutputTypes() {
output_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
output_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
output_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
output_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
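With the predictor's type queries now able to report BFLOAT16, callers that size host buffers from the reported type need a 2-byte branch. A small sketch follows; it assumes the public paddle_infer::Predictor forwards GetInputTypes() from the AnalysisPredictor change above, and the size table is illustrative.

```cpp
// Sketch: map the reported paddle_infer::DataType to a host element size.
// Assumes `predictor` exposes GetInputTypes(), mirroring
// AnalysisPredictor::GetInputTypes() above.
#include <cstddef>
#include <iostream>

#include "paddle_inference_api.h"

size_t ElementSize(paddle_infer::DataType dt) {
  switch (dt) {
    case paddle_infer::DataType::FLOAT64:  return 8;
    case paddle_infer::DataType::FLOAT32:  return 4;
    case paddle_infer::DataType::FLOAT16:
    case paddle_infer::DataType::BFLOAT16: return 2;  // both are 16-bit formats
    case paddle_infer::DataType::INT64:    return 8;
    case paddle_infer::DataType::INT32:    return 4;
    default:                               return 1;  // UINT8 / INT8 / BOOL
  }
}

void PrintInputElementSizes(paddle_infer::Predictor* predictor) {
  for (const auto& kv : predictor->GetInputTypes()) {
    std::cout << kv.first << ": " << ElementSize(kv.second) << " bytes/elem\n";
  }
}
```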
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -31,14 +31,18 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/resource_manager.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif

namespace paddle_infer {
using float16 = paddle::platform::float16;
using bfloat16 = phi::dtype::bfloat16;
namespace experimental {
class InternalUtils;
};
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::BFLOAT16:
return sizeof(uint16_t);
case PaddleDType::INT64:
return sizeof(int64_t);
case PaddleDType::INT32:
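PaddleDtypeSize() now reports sizeof(uint16_t) == 2 for BFLOAT16, so byte lengths for the legacy PaddleTensor feed path can be computed uniformly. A hedged sketch:

```cpp
// Sketch: compute the byte length of a legacy PaddleTensor feed buffer,
// relying only on PaddleDtypeSize(), which now covers PaddleDType::BFLOAT16.
#include <cstddef>
#include <functional>
#include <numeric>

#include "paddle_inference_api.h"

size_t FeedByteLen(const paddle::PaddleTensor& t) {
  size_t numel = std::accumulate(t.shape.begin(), t.shape.end(),
                                 static_cast<size_t>(1),
                                 std::multiplies<size_t>());
  return numel * static_cast<size_t>(paddle::PaddleDtypeSize(t.dtype));
}
```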
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -223,6 +223,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
input_ptr = input.mutable_data<float>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::INT32) {
input_ptr = input.mutable_data<int32_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
input_ptr = input.mutable_data<bfloat16>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
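The legacy NativePaddlePredictor feed path also accepts BFLOAT16 now. A sketch of constructing such a feed tensor for the Run(inputs, &outputs) interface; the tensor name and fill value are placeholders.

```cpp
// Sketch: build a bfloat16 PaddleTensor for the legacy feed interface.
#include <vector>

#include "paddle/phi/common/bfloat16.h"
#include "paddle_inference_api.h"

paddle::PaddleTensor MakeBf16Feed(const std::vector<int>& shape) {
  using bfloat16 = phi::dtype::bfloat16;
  size_t numel = 1;
  for (int d : shape) numel *= static_cast<size_t>(d);

  paddle::PaddleTensor t;
  t.name = "x";  // hypothetical input name
  t.shape = shape;
  t.dtype = paddle::PaddleDType::BFLOAT16;
  t.data.Resize(numel * sizeof(bfloat16));  // PaddleBuf owns the host buffer
  auto* p = static_cast<bfloat16*>(t.data.data());
  for (size_t i = 0; i < numel; ++i) p[i] = bfloat16(0.0f);  // illustrative fill
  return t;
}
```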
30 changes: 30 additions & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -31,6 +31,7 @@
namespace paddle_infer {

using float16 = paddle::platform::float16;
using bfloat16 = paddle::platform::bfloat16;

void Tensor::Reshape(const std::vector<int> &shape) {
#ifdef PADDLE_WITH_ONNXRUNTIME
@@ -178,6 +179,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
} else if (type == paddle::framework::proto::VarType::FP16) {
return DataType::FLOAT16;
} else if (type == paddle::framework::proto::VarType::BF16) {
return DataType::BFLOAT16;
} else if (type == paddle::framework::proto::VarType::INT64) {
return DataType::INT64;
} else if (type == paddle::framework::proto::VarType::INT32) {
@@ -289,6 +292,11 @@ struct DataTypeInfo<float16> {
phi::DataType TYPE = phi::DataType::FLOAT16;
};

template <>
struct DataTypeInfo<bfloat16> {
phi::DataType TYPE = phi::DataType::BFLOAT16;
};

template <>
struct DataTypeInfo<int64_t> {
phi::DataType TYPE = phi::DataType::INT64;
@@ -500,6 +508,7 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bfloat16>(const bfloat16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bool>(const bool *data);

template PD_INFER_DECL void Tensor::ShareExternalData<double>(
@@ -537,6 +546,11 @@ template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bfloat16>(
const bfloat16 *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bool>(
const bool *data,
const std::vector<int> &shape,
@@ -550,6 +564,7 @@ template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bfloat16>(bfloat16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bool>(bool *data) const;

template PD_INFER_DECL void Tensor::CopyToCpuImpl<double>(
@@ -568,6 +583,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuImpl<int8_t>(
int8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float16>(
float16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bfloat16>(
bfloat16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bool>(bool *data,
void *exec_stream,
CallbackFunc cb,
@@ -587,6 +604,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(
bool *data, void *exec_stream) const;

@@ -604,6 +623,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(bool *data,
CallbackFunc cb,
void *cb_params) const;
@@ -622,6 +643,8 @@ template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bfloat16 *Tensor::data<bfloat16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bool *Tensor::data<bool>(PlaceType *place,
int *size) const;

@@ -632,6 +655,8 @@ template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
template PD_INFER_DECL bfloat16 *Tensor::mutable_data<bfloat16>(
PlaceType place);
template PD_INFER_DECL bool *Tensor::mutable_data<bool>(PlaceType place);

Tensor::Tensor(void *scope, const void *device_contexts)
@@ -783,6 +808,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
#endif

namespace experimental {
@@ -921,6 +947,8 @@ template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<float16>(
paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, const bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bool>(
paddle_infer::Tensor *t, const bool *data, cudaStream_t stream);

@@ -938,6 +966,8 @@ template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<float16>(
paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bool>(
paddle_infer::Tensor *t, bool *data, cudaStream_t stream);

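Besides the CopyFromCpu/CopyToCpu instantiations above, ShareExternalData<bfloat16> lets an existing buffer back an input tensor without a copy. A sketch, assuming the buffer already lives on the GPU in the layout the model expects:

```cpp
// Sketch: zero-copy bind of an existing bfloat16 device buffer to an input
// tensor via the new ShareExternalData<bfloat16> instantiation. `input` is a
// handle from Predictor::GetInputHandle(); `dev_buf` is assumed to stay valid
// for the lifetime of the run.
#include <vector>

#include "paddle/phi/common/bfloat16.h"
#include "paddle_inference_api.h"

using bfloat16 = phi::dtype::bfloat16;

void BindBf16Input(paddle_infer::Tensor* input,
                   const bfloat16* dev_buf,
                   const std::vector<int>& shape) {
  input->ShareExternalData<bfloat16>(dev_buf, shape,
                                     paddle_infer::PlaceType::kGPU,
                                     paddle_infer::DataLayout::kNCHW);
}
```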
14 changes: 14 additions & 0 deletions paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -108,6 +108,13 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
cb,
cb_params);
break;
case PaddleDType::BFLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kCPU),
exec_stream,
cb,
cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
@@ -172,6 +179,13 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
case PaddleDType::BFLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::bfloat16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
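The contrib copy path now recognizes BFLOAT16 as well, computing the device copy length as element count times 2 bytes. A thin sketch of forwarding one predictor's bfloat16 output into another predictor's input with the copy utility; the header path and the static CopyTensor entry point are assumed from this repository's layout.

```cpp
// Sketch: copy a bfloat16 tensor between two predictors via the contrib
// utility. `dst`/`src` come from GetInputHandle()/GetOutputHandle().
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"  // assumed path

void ForwardBf16(paddle_infer::Tensor* dst, const paddle_infer::Tensor& src) {
  paddle_infer::contrib::TensorUtils::CopyTensor(dst, src);
}
```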
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_tensor.h
@@ -62,6 +62,7 @@ enum DataType {
FLOAT16,
BOOL,
FLOAT64,
BFLOAT16,
// TODO(Inference): support more data types if needed.
};
