FusedMultiTransformer optimization #59385

Merged
3 changes: 3 additions & 0 deletions .gitignore
@@ -96,3 +96,6 @@ paddle/phi/api/profiler/__init__.py
python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
paddle/phi/kernels/fusion/cutlass/conv2d/generated/*
python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py

# these files are auto-generated by memory_efficient_fmha_variable
autogen*
9 changes: 9 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -169,6 +169,8 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
input_ptr = t->mutable_data<int32_t>(ddim, place);
} else if (pt.dtype == PaddleDType::FLOAT16) {
input_ptr = t->mutable_data<float16>(ddim, place);
} else if (pt.dtype == PaddleDType::BFLOAT16) {
input_ptr = t->mutable_data<bfloat16>(ddim, place);
} else {
LOG(ERROR) << "unsupported feed type " << pt.dtype;
return false;
@@ -1226,6 +1228,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::proto::VarType::FP16) {
GetFetchOne<float16>(fetch, output);
output->dtype = PaddleDType::FLOAT16;
} else if (type == framework::proto::VarType::BF16) {
GetFetchOne<bfloat16>(fetch, output);
output->dtype = PaddleDType::BFLOAT16;
} else {
LOG(ERROR) << "unknown type, only support float32, float16, int64 and "
"int32 now.";
@@ -1766,6 +1771,8 @@ AnalysisPredictor::GetInputTypes() {
input_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
input_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
input_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
input_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
@@ -1819,6 +1826,8 @@ AnalysisPredictor::GetOutputTypes() {
output_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
output_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
output_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
output_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
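With the BF16 branches above in place, a BFLOAT16 feed can travel the classic PaddleTensor path end to end. A minimal sketch of how a caller might exercise it; the predictor creation, tensor name, and shape are placeholder assumptions, not part of this diff:

  // Sketch only: feed a BFLOAT16 input through the PaddleTensor API.
  // The predictor, the name "x", and the shape are hypothetical.
  #include <cstdint>
  #include <vector>
  #include "paddle_inference_api.h"

  void FeedBf16(paddle::PaddlePredictor* predictor) {
    paddle::PaddleTensor input;
    input.name = "x";                              // hypothetical input name
    input.shape = {1, 16};                         // hypothetical shape
    input.dtype = paddle::PaddleDType::BFLOAT16;   // new enum value from this PR
    input.data.Resize(1 * 16 * sizeof(uint16_t));  // bfloat16 is 2 bytes per element
    // ... fill input.data.data() with bf16-encoded values ...

    std::vector<paddle::PaddleTensor> outputs;
    predictor->Run({input}, &outputs);
    // A BF16 fetch now comes back with dtype == PaddleDType::BFLOAT16
    // (the GetFetch branch added above).
  }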
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -31,14 +31,18 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/resource_manager.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif

namespace paddle_infer {
using float16 = paddle::platform::float16;
using bfloat16 = phi::dtype::bfloat16;
namespace experimental {
class InternalUtils;
};
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::BFLOAT16:
return sizeof(uint16_t);
case PaddleDType::INT64:
return sizeof(int64_t);
case PaddleDType::INT32:
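PaddleDtypeSize now reports 2 bytes for BFLOAT16, which is what callers use to size PaddleBuf storage. A small illustration, assuming PaddleDtypeSize is reachable through the public paddle namespace as declared in the inference headers:

  // Illustration: byte size of a BF16 feed buffer using the updated helper.
  size_t Bf16NumBytes(const std::vector<int>& shape) {
    size_t numel = 1;
    for (int d : shape) numel *= static_cast<size_t>(d);
    // PaddleDtypeSize(PaddleDType::BFLOAT16) returns sizeof(uint16_t), i.e. 2.
    return numel * paddle::PaddleDtypeSize(paddle::PaddleDType::BFLOAT16);
  }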
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -223,6 +223,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
input_ptr = input.mutable_data<float>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::INT32) {
input_ptr = input.mutable_data<int32_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
input_ptr = input.mutable_data<bfloat16>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
30 changes: 30 additions & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -31,6 +31,7 @@
namespace paddle_infer {

using float16 = paddle::platform::float16;
using bfloat16 = paddle::platform::bfloat16;

void Tensor::Reshape(const std::vector<int> &shape) {
#ifdef PADDLE_WITH_ONNXRUNTIME
@@ -178,6 +179,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
} else if (type == paddle::framework::proto::VarType::FP16) {
return DataType::FLOAT16;
} else if (type == paddle::framework::proto::VarType::BF16) {
return DataType::BFLOAT16;
} else if (type == paddle::framework::proto::VarType::INT64) {
return DataType::INT64;
} else if (type == paddle::framework::proto::VarType::INT32) {
@@ -289,6 +292,11 @@ struct DataTypeInfo<float16> {
phi::DataType TYPE = phi::DataType::FLOAT16;
};

template <>
struct DataTypeInfo<bfloat16> {
phi::DataType TYPE = phi::DataType::BFLOAT16;
};

template <>
struct DataTypeInfo<int64_t> {
phi::DataType TYPE = phi::DataType::INT64;
@@ -500,6 +508,7 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bfloat16>(const bfloat16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bool>(const bool *data);

template PD_INFER_DECL void Tensor::ShareExternalData<double>(
@@ -537,6 +546,11 @@ template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bfloat16>(
const bfloat16 *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bool>(
const bool *data,
const std::vector<int> &shape,
@@ -550,6 +564,7 @@ template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bfloat16>(bfloat16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bool>(bool *data) const;

template PD_INFER_DECL void Tensor::CopyToCpuImpl<double>(
@@ -568,6 +583,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuImpl<int8_t>(
int8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float16>(
float16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bfloat16>(
bfloat16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bool>(bool *data,
void *exec_stream,
CallbackFunc cb,
@@ -587,6 +604,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(
bool *data, void *exec_stream) const;

@@ -604,6 +623,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(bool *data,
CallbackFunc cb,
void *cb_params) const;
@@ -622,6 +643,8 @@ template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bfloat16 *Tensor::data<bfloat16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bool *Tensor::data<bool>(PlaceType *place,
int *size) const;

@@ -632,6 +655,8 @@ template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
template PD_INFER_DECL bfloat16 *Tensor::mutable_data<bfloat16>(
PlaceType place);
template PD_INFER_DECL bool *Tensor::mutable_data<bool>(PlaceType place);

Tensor::Tensor(void *scope, const void *device_contexts)
@@ -783,6 +808,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
#endif

namespace experimental {
@@ -921,6 +947,8 @@ template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<float16>(
paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, const bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bool>(
paddle_infer::Tensor *t, const bool *data, cudaStream_t stream);

@@ -938,6 +966,8 @@ template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<float16>(
paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bool>(
paddle_infer::Tensor *t, bool *data, cudaStream_t stream);

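Taken together, the new instantiations let the zero-copy Tensor API move bfloat16 buffers directly between host memory and the predictor. A rough usage sketch, assuming a paddle_infer::Predictor built from a Config elsewhere; the shape is a placeholder:

  // Sketch only: zero-copy BF16 input/output round trip.
  #include <vector>
  #include "paddle_inference_api.h"
  #include "paddle/phi/common/bfloat16.h"

  using bfloat16 = phi::dtype::bfloat16;

  void RunBf16(paddle_infer::Predictor* predictor,
               const std::vector<bfloat16>& host_in,
               std::vector<bfloat16>* host_out) {
    auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
    input->Reshape({1, static_cast<int>(host_in.size())});  // placeholder shape
    input->CopyFromCpu(host_in.data());   // new CopyFromCpu<bfloat16> instantiation

    predictor->Run();

    auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
    size_t numel = 1;
    for (int d : output->shape()) numel *= static_cast<size_t>(d);
    host_out->resize(numel);
    output->CopyToCpu(host_out->data());  // new CopyToCpu<bfloat16> instantiation
  }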
14 changes: 14 additions & 0 deletions paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -108,6 +108,13 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
cb,
cb_params);
break;
case PaddleDType::BFLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kCPU),
exec_stream,
cb,
cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
@@ -172,6 +179,13 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
case PaddleDType::BFLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::bfloat16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
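The GPU branch sizes the copy as data_size * 2 because bfloat16, like float16, occupies two bytes per element. If the public contrib wrapper around CopyTensorImpl is used (TensorUtils::CopyTensor is assumed here; it is not shown in this diff), BF16 tensors can now be duplicated the same way as the other dtypes:

  // Hypothetical usage; TensorUtils::CopyTensor(dst, src) is assumed to forward
  // to the CopyTensorImpl shown above with a null stream/callback.
  #include "paddle/fluid/inference/api/paddle_infer_contrib.h"

  void DuplicateBf16(paddle_infer::Tensor* dst, const paddle_infer::Tensor& src) {
    // For a BFLOAT16 src on GPU, data_len is now numel * 2 bytes.
    paddle_infer::contrib::TensorUtils::CopyTensor(dst, src);
  }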
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_tensor.h
@@ -62,6 +62,7 @@ enum DataType {
FLOAT16,
BOOL,
FLOAT64,
BFLOAT16,
// TODO(Inference): support more data types if needed.
};
