pull from laipaang/qingshui-2.4.2 (PaddlePaddle#108)
* trie by custname, aigc-model-42

* one stage topk, aigc-model-74

* copy from cpu paddle tensor & infer run release GIL, aigc-model-79

* reduced batch topk, aigc-model-85

---------

Co-authored-by: wanglipeng <wanglipeng@baidu.com>
laipaang and wanglipeng authored Jan 16, 2024
1 parent 8888367 commit a7e3f45
Showing 14 changed files with 841 additions and 112 deletions.
6 changes: 3 additions & 3 deletions paddle/fluid/framework/trie.h
@@ -64,7 +64,7 @@ struct File {

 struct Node {
   uint32_t id = 0;
-  uint16_t label = 0;
+  uint32_t label = 0;
   std::vector<uint32_t> child;
   uint8_t aleaf = 0;
 };
@@ -74,7 +74,7 @@ struct Node {
   virtual ~Trie() {}
   int load(const std::string& dir, const uint32_t thr_num=20u);

-  uint16_t label(uint32_t id) {
+  uint32_t label(uint32_t id) {
     return label_.at(id);
   }
@@ -157,7 +157,7 @@ struct Node {
   void load_file(uint32_t thr_id, File& file);
   void stat_file(uint32_t thr_id, File& file);

-  std::vector<uint16_t> label_;
+  std::vector<uint32_t> label_;
   std::vector<uint8_t> aleaf_;
   std::vector<uint32_t> child_mem_;
   std::vector<uint32_t> mem_off_;
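Note: widening label from uint16_t to uint32_t matters once trie labels carry token ids from a large vocabulary, since a 16-bit label silently truncates any id above 65535. A minimal sketch of the failure mode (illustrative, not part of the patch):

    #include <cstdint>
    #include <iostream>

    int main() {
      uint32_t token_id = 70000;  // plausible id in a vocabulary larger than 64K
      uint16_t narrow = static_cast<uint16_t>(token_id);  // silently truncates
      std::cout << narrow << "\n";  // prints 4464, not 70000
      return 0;
    }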
43 changes: 41 additions & 2 deletions paddle/fluid/framework/trie_manager.cc
@@ -19,6 +19,45 @@ namespace paddle {
 namespace framework {
 std::shared_ptr<TrieManager> TrieManager::_s_instance = nullptr;

+void TrieManager::reset(const std::vector<int>& labels) {
+  VLOG(3) << "trie reset...";
+  std::unique_lock<std::mutex> lock(mtx_);
+
+  size_t root = 0;
+  size_t chs = trie_.child_size(root);
+  std::unordered_map<uint32_t, uint32_t> l2n;
+  for (size_t i = 0; i < chs; ++i) {
+    uint32_t cid = trie_.child_at(root, i);
+    uint32_t lab = trie_.label(cid);
+    l2n.insert({lab, cid});
+  }
+
+  parent_idx_.mutable_data<int64_t>({int(labels.size())}, phi::GPUPinnedPlace());
+  int64_t* parent_idx = parent_idx_.data<int64_t>();
+
+  select_ids_.mutable_data<int64_t>({int(labels.size())}, phi::GPUPinnedPlace());
+  int64_t* select_ids = select_ids_.data<int64_t>();
+
+  label2node_.resize(labels.size());
+  for (size_t i = 0; i < labels.size(); ++i) {
+    auto it = l2n.find(labels[i]);
+    uint32_t label = endid_;
+    uint32_t nodeid = end_nodeid_;
+
+    if (it != l2n.end()) {
+      label = labels[i];
+      nodeid = it->second;
+    }
+
+    parent_idx[i] = i;
+    select_ids[i] = label;
+    label2node_[i].insert({label, nodeid});
+  }
+
+  phase_ = Phase::run;
+  cv_.notify_one();
+}
+
 void TrieManager::reset() {
   VLOG(3) << "trie reset...";
   std::unique_lock<std::mutex> lock(mtx_);
@@ -84,8 +123,8 @@ void TrieManager::run() {
   int64_t* parent_idx = parent_idx_.data<int64_t>();
   int64_t* select_ids = select_ids_.data<int64_t>();

-  std::vector<std::unordered_map<uint16_t, uint32_t>> label2node(numel);
-  std::vector<std::vector<uint16_t>> outs(numel);
+  std::vector<std::unordered_map<uint32_t, uint32_t>> label2node(numel);
+  std::vector<std::vector<uint32_t>> outs(numel);
   parallel_run_range(numel, thr_num, [this, parent_idx, select_ids, &outs, &label2node] (
       uint32_t thr_id, uint32_t start, uint32_t end) {
     for (size_t i = start; i < end; ++i) {
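The new reset(labels) overload seeds one beam slot per entry: each start label is mapped to its child node under the trie root, and labels with no matching child fall back to endid_/end_nodeid_. A hypothetical driver sketch (the path and label ids are placeholders, not from the commit):

    // Seed a trie-constrained search with one start label per beam slot.
    auto mgr = paddle::framework::TrieManager::SetInstance(/*endid=*/2);
    mgr->load("/path/to/trie");   // thr_num defaults to 20
    mgr->reset({101, 102, 103});  // labels matched against the root's children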
7 changes: 4 additions & 3 deletions paddle/fluid/framework/trie_manager.h
@@ -69,7 +69,7 @@ enum class Phase {
 };

 public:
-  TrieManager(uint16_t endid) : endid_(endid),
+  TrieManager(uint32_t endid) : endid_(endid),
       place_(platform::GetCurrentDeviceId()) {
     thread_ = std::thread(&TrieManager::run, this);
   }
@@ -94,7 +94,7 @@ enum class Phase {
     return _s_instance;
   }

-  static std::shared_ptr<TrieManager> SetInstance(uint16_t endid) {
+  static std::shared_ptr<TrieManager> SetInstance(uint32_t endid) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
     if (nullptr == _s_instance) {
@@ -111,6 +111,7 @@ enum class Phase {
     return trie_.load(dir, thr_num);
   }
   void reset();
+  void reset(const std::vector<int>& labels);
   void search_start(const Tensor* d_parent, const Tensor* d_select);
   void search_wait();
@@ -124,7 +125,7 @@ enum class Phase {
   // cpu
   Tensor parent_idx_;
   Tensor select_ids_;
-  std::vector<std::unordered_map<uint16_t, uint32_t>> label2node_;
+  std::vector<std::unordered_map<uint32_t, uint32_t>> label2node_;

   // cpu
   Tensor next_out_;
@@ -245,7 +245,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel<T> {
     qktv_out.Resize({{bsz, num_head, seq_len, dim_head}});
     auto *qktv_out_data =
         dev_ctx.Alloc<T>(&qktv_out, qktv_out.numel() * sizeof(T));
-    fmha_out.Resize({{token_num, num_head, dim_head}});
+    fmha_out.Resize({{bsz, seq_len, num_head, dim_head}});
     auto *fmha_out_data =
         dev_ctx.Alloc<T>(&fmha_out, fmha_out.numel() * sizeof(T));
5 changes: 4 additions & 1 deletion paddle/fluid/pybind/box_helper_py.cc
@@ -142,7 +142,10 @@ void BindTrieManager(py::module* m) {
            py::arg("thr_num")=20u,
            py::call_guard<py::gil_scoped_release>())
       .def("reset",
-           &framework::TrieManager::reset,
+           py::overload_cast<>(&framework::TrieManager::reset),
+           py::call_guard<py::gil_scoped_release>())
+      .def("reset",
+           py::overload_cast<const std::vector<int>&>(&framework::TrieManager::reset),
            py::call_guard<py::gil_scoped_release>());
 } // end TrieManager

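Once TrieManager::reset is overloaded, a bare &framework::TrieManager::reset no longer names a unique function, so each binding must select its overload with py::overload_cast. A self-contained sketch of the same pattern (the Widget type is hypothetical):

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>  // converts Python lists to std::vector
    #include <vector>

    namespace py = pybind11;

    struct Widget {
      void reset() {}
      void reset(const std::vector<int>& labels) { (void)labels; }
    };

    PYBIND11_MODULE(example, m) {
      py::class_<Widget>(m, "Widget")
          .def(py::init<>())
          .def("reset", py::overload_cast<>(&Widget::reset))
          .def("reset", py::overload_cast<const std::vector<int>&>(&Widget::reset));
    }

From Python, widget.reset() and widget.reset([1, 2, 3]) then dispatch on the argument list.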
49 changes: 42 additions & 7 deletions paddle/fluid/pybind/inference_api.cc
@@ -228,6 +228,35 @@ void PaddleInferTensorCreate(
   tensor.CopyFromCpu(static_cast<const T *>(data.data()));
 }

+void CopyFromCpuPaddleTensor(paddle_infer::Tensor &tensor,
+                             paddle::experimental::Tensor &&paddle_tensor) {
+  std::vector<int> shape;
+  for (int i = 0; i < paddle_tensor.dims().size(); ++i) {
+    shape.push_back(paddle_tensor.dims()[i]);
+  }
+  tensor.Reshape(std::move(shape));
+
+  switch (paddle_tensor.dtype()) {
+    case paddle::experimental::DataType::FLOAT16:
+      tensor.CopyFromCpu(static_cast<const paddle::platform::float16 *>(
+          paddle_tensor.data<paddle::platform::float16>()));
+      break;
+    case paddle::experimental::DataType::FLOAT32:
+      tensor.CopyFromCpu(static_cast<const float *>(paddle_tensor.data<float>()));
+      break;
+    case paddle::experimental::DataType::INT32:
+      tensor.CopyFromCpu(static_cast<const int32_t *>(paddle_tensor.data<int32_t>()));
+      break;
+    case paddle::experimental::DataType::INT64:
+      tensor.CopyFromCpu(static_cast<const int64_t *>(paddle_tensor.data<int64_t>()));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported data type. Now copy_from_cpu only supports FLOAT16, FLOAT32, "
+          "INT32, and INT64."));
+  }
+}
+
 paddle_infer::PlaceType ToPaddleInferPlace(
     phi::AllocationType allocation_type) {
   if (allocation_type == phi::AllocationType::CPU) {
@@ -585,7 +614,8 @@ void BindPaddlePredictor(py::module *m) {
              std::vector<PaddleTensor> outputs;
              self.Run(inputs, &outputs);
              return outputs;
-           })
+           },
+           py::call_guard<py::gil_scoped_release>())
       .def("get_input_tensor", &PaddlePredictor::GetInputTensor)
       .def("get_output_tensor", &PaddlePredictor::GetOutputTensor)
       .def("get_input_names", &PaddlePredictor::GetInputNames)
@@ -634,7 +664,8 @@ void BindNativePredictor(py::module *m) {
              std::vector<PaddleTensor> outputs;
              self.Run(inputs, &outputs);
              return outputs;
-           })
+           },
+           py::call_guard<py::gil_scoped_release>())
       .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor)
       .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
       .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
@@ -926,7 +957,8 @@ void BindAnalysisPredictor(py::module *m) {
              std::vector<PaddleTensor> outputs;
              self.Run(inputs, &outputs);
              return outputs;
-           })
+           },
+           py::call_guard<py::gil_scoped_release>())
       .def("get_input_tensor", &AnalysisPredictor::GetInputTensor)
       .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor)
       .def("get_input_names", &AnalysisPredictor::GetInputNames)
@@ -972,11 +1004,9 @@ void BindPaddleInferPredictor(py::module *m) {
       .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle)
       .def("run",
           [](paddle_infer::Predictor &self) {
-#ifdef PADDLE_WITH_ASCEND_CL
-            pybind11::gil_scoped_release release;
-#endif
             self.Run();
-          })
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def("clone",
            [](paddle_infer::Predictor &self) { return self.Clone(nullptr); })
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -1024,6 +1054,11 @@ void BindPaddleInferTensor(py::module *m) {
       .def("copy_from_cpu_bind",
            &PaddleInferTensorCreate<paddle_infer::float16>)
       .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate)
+      .def("_copy_from_cpu_bind",
+           [](paddle_infer::Tensor &self, const py::handle &input) {
+             PyObject *obj = input.ptr();
+             CopyFromCpuPaddleTensor(self, std::move(CastPyArg2Tensor(obj, 0)));
+           })
       .def("share_external_data_bind", &PaddleInferShareExternalData)
       .def("_share_external_data_paddle_tensor_bind",
            [](paddle_infer::Tensor &self, const py::handle &input) {
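The run bindings above drop the PADDLE_WITH_ASCEND_CL guard and release the GIL unconditionally via py::call_guard, so other Python threads can make progress while C++ inference runs. A minimal sketch of the pattern (the module and function are hypothetical):

    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // Long-running C++ work that never touches Python objects,
    // so it is safe to run without holding the GIL.
    void heavy_compute() { /* ... */ }

    PYBIND11_MODULE(example, m) {
      m.def("run", &heavy_compute, py::call_guard<py::gil_scoped_release>());
    }

The guard reacquires the GIL automatically when the call returns or throws.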
2 changes: 1 addition & 1 deletion paddle/phi/api/yaml/ops.yaml
@@ -210,7 +210,7 @@
   backward : flip_grad

 - op : beam_search_softmax
-  args : (Tensor logits, Tensor cum_scores, Tensor sequence_lengths, Tensor stop_flags, Tensor end_ids, Tensor step_ids, Tensor last_cache_ids, Tensor last_beam_offsets, int beam_size, int max_seq_len, int max_dec_len, bool fuse_softmax, bool early_stop, float length_penalty=0.0)
+  args : (Tensor logits, Tensor cum_scores, Tensor sequence_lengths, Tensor stop_flags, Tensor end_ids, Tensor step_ids, Tensor last_cache_ids, Tensor last_beam_offsets, int beam_size, int max_seq_len, int max_dec_len, bool fuse_softmax, bool early_stop, float length_penalty=0.0, bool one_stage_topk=false)
   output : Tensor(ids_this_time), Tensor(out_cum_scores), Tensor(cache_ids), Tensor(beam_offsets), Tensor(parent_idx), Tensor(stop_flags_out), Tensor(seq_lens_out), Tensor(step_ids_out)
   infer_meta :
     func : BeamSearchSoftmaxInferMeta
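The commit message ("one stage topk", "reduced batch topk") suggests one_stage_topk toggles between a single selection pass over all beam candidates and a staged per-beam selection; the CUDA kernel itself is not shown in this excerpt. A schematic CPU-side contrast of the two strategies (illustrative only, assuming k <= vocab):

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Indices of the k largest scores (schematic helper).
    static std::vector<int> topk(const std::vector<float>& s, int k) {
      std::vector<int> idx(s.size());
      std::iota(idx.begin(), idx.end(), 0);
      std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                        [&](int a, int b) { return s[a] > s[b]; });
      idx.resize(k);
      return idx;
    }

    // One-stage: a single top-k over the flattened [beam_size, vocab] scores.
    std::vector<int> one_stage_topk(const std::vector<float>& scores, int k) {
      return topk(scores, k);
    }

    // Two-stage: top-k within each beam first, then top-k over the survivors.
    std::vector<int> two_stage_topk(const std::vector<float>& scores,
                                    int beam_size, int vocab, int k) {
      std::vector<float> cand_score;
      std::vector<int> cand_index;
      for (int b = 0; b < beam_size; ++b) {
        std::vector<float> row(scores.begin() + b * vocab,
                               scores.begin() + (b + 1) * vocab);
        for (int j : topk(row, k)) {
          cand_score.push_back(row[j]);
          cand_index.push_back(b * vocab + j);
        }
      }
      std::vector<int> out;
      for (int j : topk(cand_score, k)) out.push_back(cand_index[j]);
      return out;
    }

Both return the same top-k set; the practical difference on GPU is how much intermediate data each approach materializes per pass.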
1 change: 1 addition & 0 deletions paddle/phi/infermeta/multiary.cc
@@ -688,6 +688,7 @@ void BeamSearchSoftmaxInferMeta(const MetaTensor& logits,
                                 bool fuse_softmax,
                                 bool early_stop,
                                 float length_penalty,
+                                bool one_stage_topk,
                                 MetaTensor* ids_this_time,
                                 MetaTensor* out_cum_scores,
                                 MetaTensor* cache_ids,
1 change: 1 addition & 0 deletions paddle/phi/infermeta/multiary.h
@@ -204,6 +204,7 @@ void BeamSearchSoftmaxInferMeta(const MetaTensor& logits,
                                 bool fuse_softmax,
                                 bool early_stop,
                                 float length_penalty,
+                                bool one_stage_topk,
                                 MetaTensor* ids_this_time,
                                 MetaTensor* out_cum_scores,
                                 MetaTensor* cache_ids,
1 change: 1 addition & 0 deletions paddle/phi/kernels/fusion/beam_search_softmax.h
@@ -35,6 +35,7 @@ void BeamSearchSoftmaxKernel(const Context &dev_ctx,
                              bool fuse_softmax,
                              bool early_stop,
                              float length_penalty,
+                             bool one_stage_topk,
                              DenseTensor *ids_this_time,
                              DenseTensor *out_cum_scores,
                              DenseTensor *cache_ids,