Commit 7c3d9d5
…nto develop
DesmonDay committed Jan 13, 2025
2 parents 042179d + 027b530 commit 7c3d9d5
Showing 55 changed files with 773 additions and 882 deletions.
27 changes: 14 additions & 13 deletions README.md
@@ -90,6 +90,7 @@
| [ChatGLM2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/chatglm2) | THUDM/chatglm2-6b |
| [ChatGLM3](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/chatglm2) | THUDM/chatglm3-6b |
| [DeepSeekV2](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/config/deepseek-v2) | deepseek-ai/DeepSeek-V2, deepseek-ai/DeepSeek-V2-Chat, deepseek-ai/DeepSeek-V2-Lite, deepseek-ai/DeepSeek-V2-Lite-Chat, deepseek-ai/DeepSeek-Coder-V2-Base, deepseek-ai/DeepSeek-Coder-V2-Instruct, deepseek-ai/DeepSeek-Coder-V2-Lite-Base, deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct |
| [DeepSeekV3](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/config/deepseek-v2) | deepseek-ai/DeepSeek-V3, deepseek-ai/DeepSeek-V3-Base |
| [Gemma](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/gemma) | google/gemma-7b, google/gemma-7b-it, google/gemma-2b, google/gemma-2b-it |
| [Mistral](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/mistral) | mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-7B-v0.1 |
| [Mixtral](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/mixtral) | mistralai/Mixtral-8x7B-Instruct-v0.1 |
@@ -130,19 +131,19 @@


| Model                                      | Pretrain | SFT | LoRA | FlashMask | Prefix Tuning | DPO/SimPO/ORPO/KTO | RLHF | Mergekit | Quantization |
|--------------------------------------------|:--------:|:---:|:----:|:---------:|:-------------:|:------------------:|:----:|:--------:|:------------:|
| [Llama](./llm/config/llama)                |    ✅    | ✅  |  ✅  |    ✅     |      ✅       |         ✅         |  ✅  |    ✅    |      ✅      |
| [Qwen](./llm/config/qwen)                  |    ✅    | ✅  |  ✅  |    ✅     |      ✅       |         ✅         |  🚧  |    ✅    |      🚧      |
| [Mixtral](./llm/config/mixtral)            |    ✅    | ✅  |  ✅  |    🚧     |      🚧       |         ✅         |  🚧  |    ✅    |      🚧      |
| [Mistral](./llm/config/mistral)            |    ✅    | ✅  |  ✅  |    🚧     |      ✅       |         ✅         |  🚧  |    ✅    |      🚧      |
| [Baichuan/Baichuan2](./llm/config/llama)   |    ✅    | ✅  |  ✅  |    ✅     |      ✅       |         ✅         |  🚧  |    ✅    |      ✅      |
| [ChatGLM-6B](./llm/config/chatglm)         |    ✅    | ✅  |  ✅  |    🚧     |      ✅       |         🚧         |  🚧  |    ✅    |      ✅      |
| [ChatGLM2/ChatGLM3](./llm/config/chatglm2) |    ✅    | ✅  |  ✅  |    🚧     |      ✅       |         ✅         |  🚧  |    ✅    |      ✅      |
| [Bloom](./llm/config/bloom)                |    ✅    | ✅  |  ✅  |    🚧     |      ✅       |         🚧         |  🚧  |    ✅    |      ✅      |
| [GPT-3](./llm/config/gpt-3)                |    ✅    | ✅  |  🚧  |    🚧     |      🚧       |         🚧         |  🚧  |    ✅    |      🚧      |
| [OPT](./llm/config/opt)                    |    ✅    | ✅  |  ✅  |    🚧     |      🚧       |         🚧         |  🚧  |    ✅    |      🚧      |
| [Gemma](./llm/config/gemma)                |    ✅    | ✅  |  ✅  |    🚧     |      🚧       |         ✅         |  🚧  |    ✅    |      🚧      |
| [Yuan](./llm/config/yuan)                  |    ✅    | ✅  |  ✅  |    🚧     |      🚧       |         ✅         |  🚧  |    ✅    |      🚧      |
* [LLM inference](./llm/docs/predict/inference.md) supports the LLaMA, Qwen, Mistral, ChatGLM, Bloom, and Baichuan model series. It supports Weight-Only INT8 and INT4 inference, as well as INT8 and FP8 quantized inference over WAC (weights, activations, cache KV). The supported LLM inference models are listed below:

| Model / supported quantization | FP16/BF16 | WINT8 | WINT4 | INT8-A8W8 | FP8-A8W8 | INT8-A8W8C8 |
183 changes: 95 additions & 88 deletions csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu
@@ -102,40 +102,21 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    const paddle::Tensor& seq_lens_encoder,
    const paddle::Tensor& seq_lens_decoder,
    const paddle::Tensor& max_enc_len_this_time,
    const paddle::Tensor& max_dec_len_this_time,
    const paddle::Tensor& seq_lens_this_time,
    const paddle::Tensor& cum_offsets,
    const int group_size,
    const int block_size,
    const int decoder_step_token_num) {
  paddle::Tensor encoder_batch_ids, encoder_tile_ids_per_batch, encoder_num_blocks_x_cpu,
      kv_batch_ids, kv_tile_ids_per_batch, kv_num_blocks_x_cpu, decoder_batch_ids,
      decoder_tile_ids_per_batch, decoder_num_blocks_x_cpu;
  auto stream = seq_lens_this_time.stream();
  int bsz = cum_offsets.shape()[0];
  const int encoder_block_shape_q = get_encoder_block_shape_q();
  const int decoder_block_shape_q = get_decoder_block_shape_q();

  // max_len
  auto max_len_kv =
      GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
  get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
@@ -147,77 +128,100 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
  auto max_len_kv_cpu =
      max_len_kv.copy_to(paddle::CPUPlace(), false);

  // decoder
  int max_dec_len_this_time_data = max_dec_len_this_time.data<int>()[0];
  if (max_dec_len_this_time_data > 0) {
    const uint32_t decoder_max_tile_size_per_bs_q =
        div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
    decoder_batch_ids =
        GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32,
                       seq_lens_encoder.place());
    decoder_tile_ids_per_batch =
        GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32,
                       seq_lens_encoder.place());
    auto decoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(seq_lens_this_time.data<int>(),
                                        seq_lens_encoder.data<int>(),
                                        decoder_batch_ids.data<int>(),
                                        decoder_tile_ids_per_batch.data<int>(),
                                        decoder_num_blocks_x.data<int>(),
                                        bsz,
                                        decoder_block_shape_q,
                                        group_size);
    decoder_num_blocks_x_cpu =
        decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
  } else {
    decoder_batch_ids =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::GPUPlace());
    decoder_tile_ids_per_batch =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::GPUPlace());
    decoder_num_blocks_x_cpu =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::CPUPlace());
  }

  // encoder
  int max_enc_len_this_time_data = max_enc_len_this_time.data<int>()[0];
  if (max_enc_len_this_time_data > 0) {
    const uint32_t encoder_max_tile_size_per_bs_q = div_up(
        (max_enc_len_this_time_data * group_size), encoder_block_shape_q);
    encoder_batch_ids =
        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32,
                       seq_lens_encoder.place());
    encoder_tile_ids_per_batch =
        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32,
                       seq_lens_encoder.place());
    auto encoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(),
                                        nullptr,
                                        encoder_batch_ids.data<int>(),
                                        encoder_tile_ids_per_batch.data<int>(),
                                        encoder_num_blocks_x.data<int>(),
                                        bsz,
                                        encoder_block_shape_q,
                                        group_size);
    encoder_num_blocks_x_cpu =
        encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);

    // kv
    const uint32_t max_tile_size_per_bs_kv =
        div_up(max_enc_len_this_time_data, block_size);
    kv_batch_ids = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
                                  paddle::DataType::INT32,
                                  seq_lens_encoder.place());
    kv_tile_ids_per_batch = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
                                           paddle::DataType::INT32,
                                           seq_lens_encoder.place());
    auto kv_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_kv_block<<<1, 32, 0, stream>>>(seq_lens_decoder.data<int>(),
                                         seq_lens_encoder.data<int>(),
                                         kv_batch_ids.data<int>(),
                                         kv_tile_ids_per_batch.data<int>(),
                                         kv_num_blocks_x.data<int>(),
                                         bsz,
                                         block_size,
                                         block_size);
    kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);
  } else {
    encoder_batch_ids =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::GPUPlace());
    encoder_tile_ids_per_batch =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::GPUPlace());
    encoder_num_blocks_x_cpu =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::CPUPlace());
    kv_batch_ids =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::GPUPlace());
    kv_tile_ids_per_batch =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::GPUPlace());
    kv_num_blocks_x_cpu =
        paddle::full({1}, -1, paddle::DataType::INT32, paddle::CPUPlace());
  }

  return {encoder_batch_ids,
          encoder_tile_ids_per_batch,
          encoder_num_blocks_x_cpu, /*cpu*/
          kv_batch_ids,
          kv_tile_ids_per_batch,
          kv_num_blocks_x_cpu, /*cpu*/
          decoder_batch_ids,
          decoder_tile_ids_per_batch,
          decoder_num_blocks_x_cpu, /*cpu*/
          max_len_kv_cpu /*cpu*/};
}
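To make the bookkeeping concrete: split_q_block writes out a flattened list of (batch_id, tile_id) pairs plus the total pair count, which downstream attention launches use as their grid's x dimension. The sketch below is a minimal host-side illustration under two assumptions consistent with the buffer sizing above — that div_up is plain ceiling division, and that each batch contributes div_up(q_len * group_size, block_shape_q) tiles. It is not the CUDA kernel itself.

```cpp
// Host-side sketch of the tiling that split_q_block performs on the GPU.
// Assumptions (not taken from this diff): div_up is ceiling division, and
// each batch contributes div_up(q_len * group_size, block_shape_q) tiles.
#include <cstdio>
#include <vector>

static int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int group_size = 8;      // e.g. 64 query heads sharing 8 KV heads
  const int block_shape_q = 16;  // analogue of decoder_/encoder_block_shape_q
  std::vector<int> seq_lens_q = {1, 0, 3};  // per-batch query lengths

  std::vector<int> batch_ids, tile_ids;
  for (int bid = 0; bid < static_cast<int>(seq_lens_q.size()); ++bid) {
    int num_tiles = div_up(seq_lens_q[bid] * group_size, block_shape_q);
    for (int tid = 0; tid < num_tiles; ++tid) {  // zero-length batches emit none
      batch_ids.push_back(bid);
      tile_ids.push_back(tid);
    }
  }

  // num_blocks_x plays the role of decoder_/encoder_num_blocks_x: one CUDA
  // block per (batch_id, tile_id) pair in the downstream attention launch.
  int num_blocks_x = static_cast<int>(batch_ids.size());
  printf("num_blocks_x = %d\n", num_blocks_x);  // prints 3 for this input
  for (int i = 0; i < num_blocks_x; ++i) {
    printf("block %d -> batch %d, tile %d\n", i, batch_ids[i], tile_ids[i]);
  }
  return 0;
}
```

Copying each num_blocks_x tensor to the CPU via copy_to(paddle::CPUPlace(), false) is what lets the host size those later grids, and the -1 fills in the empty branches appear to act as a "no work" sentinel for skipped phases.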
@@ -234,6 +238,7 @@ std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
const paddle::DataType& seq_lens_encoder_dtype,
const paddle::DataType& seq_lens_decoder_dtype,
const paddle::DataType& max_enc_len_this_time_dtype,
const paddle::DataType& max_dec_len_this_time_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
const paddle::DataType& cum_offsets_dtype) {
return {paddle::DataType::INT32,
@@ -252,6 +257,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
const std::vector<int64_t>& seq_lens_encoder_shape,
const std::vector<int64_t>& seq_lens_decoder_shape,
const std::vector<int64_t>& max_enc_len_this_time_shape,
const std::vector<int64_t>& max_dec_len_this_time_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
const std::vector<int64_t>& cum_offsets_shape) {
std::vector<int64_t> dynamic_shape = {-1};
@@ -272,6 +278,7 @@ PD_BUILD_OP(get_block_shape_and_split_kv_block)
.Inputs({"seq_lens_encoder",
"seq_lens_decoder",
"max_enc_len_this_time",
"max_dec_len_this_time",
"seq_lens_this_time",
"cum_offsets"})
.Outputs({"encoder_batch_ids",
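Earlier in this file, max_len_kv is produced by get_max_len_kv_ernel<128>, launched with a single 128-thread block; the kernel body is collapsed in this view. For reference only, a self-contained sketch of that kind of single-block max reduction is given below — the name, the parameter list, and the choice to reduce over seq_lens_decoder alone are assumptions for illustration, not the actual PaddleNLP kernel.

```cuda
// Illustrative single-block max reduction (NOT the real get_max_len_kv_ernel;
// its body is collapsed in this diff, so the details here are assumptions).
template <int THREADS>
__global__ void max_len_kv_sketch(int* max_len_kv,             // [1] output
                                  const int* seq_lens_decoder,  // [bsz]
                                  int bsz) {
  __shared__ int smem[THREADS];

  // Each thread strides over the batch, tracking a private maximum.
  int local_max = 0;
  for (int i = threadIdx.x; i < bsz; i += THREADS) {
    int len = seq_lens_decoder[i];
    local_max = len > local_max ? len : local_max;
  }
  smem[threadIdx.x] = local_max;
  __syncthreads();

  // Tree reduction in shared memory (THREADS must be a power of two).
  for (int offset = THREADS / 2; offset > 0; offset >>= 1) {
    if (threadIdx.x < offset) {
      int other = smem[threadIdx.x + offset];
      if (other > smem[threadIdx.x]) smem[threadIdx.x] = other;
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *max_len_kv = smem[0];
  }
}

// A launch shaped like the one in this file would be:
//   max_len_kv_sketch<128><<<1, 128, 0, stream>>>(
//       max_len_kv.data<int>(), seq_lens_decoder.data<int>(), bsz);
```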
