FusedMultiTransformer optimization #59385

Merged
3 changes: 3 additions & 0 deletions .gitignore
@@ -96,3 +96,6 @@ paddle/phi/api/profiler/__init__.py
python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
paddle/phi/kernels/fusion/cutlass/conv2d/generated/*
python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py

# these files are auto-generated by memory_efficient_fmha_variable
autogen*
9 changes: 9 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -169,6 +169,8 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
input_ptr = t->mutable_data<int32_t>(ddim, place);
} else if (pt.dtype == PaddleDType::FLOAT16) {
input_ptr = t->mutable_data<float16>(ddim, place);
} else if (pt.dtype == PaddleDType::BFLOAT16) {
input_ptr = t->mutable_data<bfloat16>(ddim, place);
} else {
LOG(ERROR) << "unsupported feed type " << pt.dtype;
return false;
@@ -1226,6 +1228,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::proto::VarType::FP16) {
GetFetchOne<float16>(fetch, output);
output->dtype = PaddleDType::FLOAT16;
} else if (type == framework::proto::VarType::BF16) {
GetFetchOne<bfloat16>(fetch, output);
output->dtype = PaddleDType::BFLOAT16;
} else {
LOG(ERROR) << "unknown type, only support float32, float16, int64 and "
"int32 now.";
@@ -1766,6 +1771,8 @@ AnalysisPredictor::GetInputTypes() {
input_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
input_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
input_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
input_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
@@ -1819,6 +1826,8 @@ AnalysisPredictor::GetOutputTypes() {
output_type[name] = paddle_infer::DataType::FLOAT32;
} else if (dtype == paddle::framework::proto::VarType::FP16) {
output_type[name] = paddle_infer::DataType::FLOAT16;
} else if (dtype == paddle::framework::proto::VarType::BF16) {
output_type[name] = paddle_infer::DataType::BFLOAT16;
} else if (dtype == paddle::framework::proto::VarType::INT64) {
output_type[name] = paddle_infer::DataType::INT64;
} else if (dtype == paddle::framework::proto::VarType::INT32) {
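With the BF16 branches above in place, a BFLOAT16 feed can travel the classic PaddleTensor path end to end. A minimal sketch of how a caller might exercise it; the predictor creation, tensor name, and shape are placeholder assumptions, not part of this diff:

  // Sketch only: feed a BFLOAT16 input through the PaddleTensor API.
  // The predictor, the name "x", and the shape are hypothetical.
  #include <cstdint>
  #include <vector>
  #include "paddle_inference_api.h"

  void FeedBf16(paddle::PaddlePredictor* predictor) {
    paddle::PaddleTensor input;
    input.name = "x";                              // hypothetical input name
    input.shape = {1, 16};                         // hypothetical shape
    input.dtype = paddle::PaddleDType::BFLOAT16;   // new enum value from this PR
    input.data.Resize(1 * 16 * sizeof(uint16_t));  // bfloat16 is 2 bytes per element
    // ... fill input.data.data() with bf16-encoded values ...

    std::vector<paddle::PaddleTensor> outputs;
    predictor->Run({input}, &outputs);
    // A BF16 fetch now comes back with dtype == PaddleDType::BFLOAT16
    // (the GetFetch branch added above).
  }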
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -31,14 +31,18 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/resource_manager.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif

namespace paddle_infer {
using float16 = paddle::platform::float16;
using bfloat16 = phi::dtype::bfloat16;
namespace experimental {
class InternalUtils;
};
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::BFLOAT16:
return sizeof(uint16_t);
case PaddleDType::INT64:
return sizeof(int64_t);
case PaddleDType::INT32:
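PaddleDtypeSize now reports 2 bytes for BFLOAT16, which is what callers use to size PaddleBuf storage. A small illustration, assuming PaddleDtypeSize is reachable through the public paddle namespace as declared in the inference headers:

  // Illustration: byte size of a BF16 feed buffer using the updated helper.
  size_t Bf16NumBytes(const std::vector<int>& shape) {
    size_t numel = 1;
    for (int d : shape) numel *= static_cast<size_t>(d);
    // PaddleDtypeSize(PaddleDType::BFLOAT16) returns sizeof(uint16_t), i.e. 2.
    return numel * paddle::PaddleDtypeSize(paddle::PaddleDType::BFLOAT16);
  }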
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -223,6 +223,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
input_ptr = input.mutable_data<float>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::INT32) {
input_ptr = input.mutable_data<int32_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
input_ptr = input.mutable_data<bfloat16>(ddim, place_);
} else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false;
30 changes: 30 additions & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -31,6 +31,7 @@
namespace paddle_infer {

using float16 = paddle::platform::float16;
using bfloat16 = paddle::platform::bfloat16;

void Tensor::Reshape(const std::vector<int> &shape) {
#ifdef PADDLE_WITH_ONNXRUNTIME
@@ -178,6 +179,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
} else if (type == paddle::framework::proto::VarType::FP16) {
return DataType::FLOAT16;
} else if (type == paddle::framework::proto::VarType::BF16) {
return DataType::BFLOAT16;
} else if (type == paddle::framework::proto::VarType::INT64) {
return DataType::INT64;
} else if (type == paddle::framework::proto::VarType::INT32) {
@@ -289,6 +292,11 @@ struct DataTypeInfo<float16> {
phi::DataType TYPE = phi::DataType::FLOAT16;
};

template <>
struct DataTypeInfo<bfloat16> {
phi::DataType TYPE = phi::DataType::BFLOAT16;
};

template <>
struct DataTypeInfo<int64_t> {
phi::DataType TYPE = phi::DataType::INT64;
@@ -500,6 +508,7 @@ template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bfloat16>(const bfloat16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bool>(const bool *data);

template PD_INFER_DECL void Tensor::ShareExternalData<double>(
@@ -537,6 +546,11 @@ template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bfloat16>(
const bfloat16 *data,
const std::vector<int> &shape,
PlaceType place,
DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bool>(
const bool *data,
const std::vector<int> &shape,
@@ -550,6 +564,7 @@ template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bfloat16>(bfloat16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bool>(bool *data) const;

template PD_INFER_DECL void Tensor::CopyToCpuImpl<double>(
@@ -568,6 +583,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuImpl<int8_t>(
int8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float16>(
float16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bfloat16>(
bfloat16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bool>(bool *data,
void *exec_stream,
CallbackFunc cb,
@@ -587,6 +604,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(
bool *data, void *exec_stream) const;

@@ -604,6 +623,8 @@ template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bfloat16>(
bfloat16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(bool *data,
CallbackFunc cb,
void *cb_params) const;
@@ -622,6 +643,8 @@ template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bfloat16 *Tensor::data<bfloat16>(PlaceType *place,
int *size) const;
template PD_INFER_DECL bool *Tensor::data<bool>(PlaceType *place,
int *size) const;

@@ -632,6 +655,8 @@ template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
template PD_INFER_DECL bfloat16 *Tensor::mutable_data<bfloat16>(
PlaceType place);
template PD_INFER_DECL bool *Tensor::mutable_data<bool>(PlaceType place);

Tensor::Tensor(void *scope, const void *device_contexts)
@@ -783,6 +808,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
#endif

namespace experimental {
@@ -921,6 +947,8 @@ template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<float16>(
paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, const bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bool>(
paddle_infer::Tensor *t, const bool *data, cudaStream_t stream);

@@ -938,6 +966,8 @@ template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<float16>(
paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bfloat16>(
paddle_infer::Tensor *t, bfloat16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bool>(
paddle_infer::Tensor *t, bool *data, cudaStream_t stream);

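Taken together, the new instantiations let the zero-copy Tensor API move bfloat16 buffers directly between host memory and the predictor. A rough usage sketch, assuming a paddle_infer::Predictor built from a Config elsewhere; the shape is a placeholder:

  // Sketch only: zero-copy BF16 input/output round trip.
  #include <vector>
  #include "paddle_inference_api.h"
  #include "paddle/phi/common/bfloat16.h"

  using bfloat16 = phi::dtype::bfloat16;

  void RunBf16(paddle_infer::Predictor* predictor,
               const std::vector<bfloat16>& host_in,
               std::vector<bfloat16>* host_out) {
    auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
    input->Reshape({1, static_cast<int>(host_in.size())});  // placeholder shape
    input->CopyFromCpu(host_in.data());   // new CopyFromCpu<bfloat16> instantiation

    predictor->Run();

    auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
    size_t numel = 1;
    for (int d : output->shape()) numel *= static_cast<size_t>(d);
    host_out->resize(numel);
    output->CopyToCpu(host_out->data());  // new CopyToCpu<bfloat16> instantiation
  }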
14 changes: 14 additions & 0 deletions paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -108,6 +108,13 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
cb,
cb_params);
break;
case PaddleDType::BFLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kCPU),
exec_stream,
cb,
cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
@@ -172,6 +179,13 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
case PaddleDType::BFLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::bfloat16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::bfloat16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, BOOL, FLOAT16, FLOAT32 and "
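The GPU branch sizes the copy as data_size * 2 because bfloat16, like float16, occupies two bytes per element. If the public contrib wrapper around CopyTensorImpl is used (TensorUtils::CopyTensor is assumed here; it is not shown in this diff), BF16 tensors can now be duplicated the same way as the other dtypes:

  // Hypothetical usage; TensorUtils::CopyTensor(dst, src) is assumed to forward
  // to the CopyTensorImpl shown above with a null stream/callback.
  #include "paddle/fluid/inference/api/paddle_infer_contrib.h"

  void DuplicateBf16(paddle_infer::Tensor* dst, const paddle_infer::Tensor& src) {
    // For a BFLOAT16 src on GPU, data_len is now numel * 2 bytes.
    paddle_infer::contrib::TensorUtils::CopyTensor(dst, src);
  }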
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_tensor.h
@@ -62,6 +62,7 @@ enum DataType {
FLOAT16,
BOOL,
FLOAT64,
BFLOAT16,
// TODO(Inference): support more data types if needed.
};
