diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6f535da99..354d015db 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -67,6 +67,22 @@
         "unordered_set": "cpp",
         "future": "cpp",
         "cfenv": "cpp",
-        "typeindex": "cpp"
+        "typeindex": "cpp",
+        "variant": "cpp",
+        "__bit_reference": "cpp",
+        "__config": "cpp",
+        "__debug": "cpp",
+        "__errc": "cpp",
+        "__hash_table": "cpp",
+        "__locale": "cpp",
+        "__mutex_base": "cpp",
+        "__node_handle": "cpp",
+        "__split_buffer": "cpp",
+        "__threading_support": "cpp",
+        "__tree": "cpp",
+        "__verbose_abort": "cpp",
+        "charconv": "cpp",
+        "ios": "cpp",
+        "locale": "cpp"
     }
-}
\ No newline at end of file
+}
diff --git a/examples/cpp/gpt/gpt_config.ini b/examples/cpp/gpt/gpt_config.ini
index ebeb066ad..efbee1d44 100644
--- a/examples/cpp/gpt/gpt_config.ini
+++ b/examples/cpp/gpt/gpt_config.ini
@@ -3,7 +3,7 @@ max_batch_size=8 ; Use for allocate the buffer
 max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter
 beam_width=1 ; beam width for beam search
 top_k=0 ; k value for top k sampling
-top_p=0.5 ; p value for top p sampling
+top_p=0 ; p value for top p sampling
 temperature=1.0 ; Use for sampling
 repetition_penalty=2.0 ; Use for sampling
 presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
@@ -17,12 +17,12 @@ model_name=gpt_124M
 ; model_name=gpt_175B
 ; model_name=self_defined
 ; model_dir=./models/megatron-models/c-model/6.7b/
-model_dir=models/openai-gpt-models/c-model/124m/1-gpu/
+model_dir=/notebooks/ft_gpt2/1-gpu/
 shared_contexts_ratio=1.0
 
 [request]
-request_batch_size=8 ; determine by the request
-request_output_len=32 ; determine by the request
+request_batch_size=2 ; determine by the request
+request_output_len=50 ; determine by the request
 return_log_probs=false ; return the output log probs and cumulative log probs.
 context_log_probs=false ; include input contexts in the cumulative log probability computation.
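Note on the sampling settings above: top_k is already 0 and this change also sets top_p to 0 while beam_width stays 1. In the stock gpt_example.cc (next file below), that combination falls into the "both zero" branch and trips FT_CHECK(beam_width > 1); the patch instead comments that selection block out. For reference, here is a minimal stand-alone sketch of the selection rule the example normally applies; the function name and printf messages are illustrative only and not part of FasterTransformer:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the runtime sampling-argument selection in gpt_example.cc that the
    // patch below comments out: both values zero selects the beam-search path,
    // otherwise the non-zero top_p / top_k values are forwarded as runtime tensors.
    static void choose_decoding_mode(uint32_t top_k, float top_p, int beam_width)
    {
        if (top_k == 0 && top_p == 0.0f) {
            // beam-search path; the example asserts beam_width > 1 here
            std::printf("beam search, beam_width=%d\n", beam_width);
        }
        else {
            if (top_p != 0.0f) {
                std::printf("top-p sampling, p=%.2f\n", top_p);
            }
            if (top_k != 0) {
                std::printf("top-k sampling, k=%u\n", top_k);
            }
        }
    }

    int main()
    {
        choose_decoding_mode(/*top_k=*/0, /*top_p=*/0.0f, /*beam_width=*/1);  // would require beam search
        return 0;
    }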
diff --git a/examples/cpp/gpt/gpt_example.cc b/examples/cpp/gpt/gpt_example.cc
index 2f4b29799..b1540ea13 100644
--- a/examples/cpp/gpt/gpt_example.cc
+++ b/examples/cpp/gpt/gpt_example.cc
@@ -343,20 +343,20 @@ void gpt_example(const INIReader reader)
         input_tensors.insert(
             {"presence_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &presence_penalty}});
     }
-    if (top_k == 0 && top_p == 0.0f) {
-        FT_CHECK(beam_width > 1);
-        input_tensors.insert({"beam_search_diversity_rate",
-                              Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
-    }
-    else {
-        input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
-        if (top_p != 0.0f) {
-            input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
-        }
-        if (top_k != 0) {
-            input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
-        }
-    }
+    // if (top_k == 0 && top_p == 0.0f) {
+    //     FT_CHECK(beam_width > 1);
+    //     input_tensors.insert({"beam_search_diversity_rate",
+    //                           Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
+    // }
+    // else {
+    //     input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
+    //     if (top_p != 0.0f) {
+    //         input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
+    //     }
+    //     if (top_k != 0) {
+    //         input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
+    //     }
+    // }
 
     std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
         {"output_ids",
@@ -439,9 +439,16 @@ void gpt_example(const INIReader reader)
         size_t outCount = total_output_len * request_batch_size * beam_width;
         int*   hBuf     = new int[outCount];
         cudaD2Hcpy(hBuf, d_output_ids, outCount);
+        size_t seqLenCount = request_batch_size * beam_width;
+        int*   hBuf2       = new int[seqLenCount];
+        cudaD2Hcpy(hBuf2, d_sequence_lengths, seqLenCount);
 
         {
             std::cout << "Writing " << outCount << " elements\n";
+            for (int i=0; i
{request_batch_size}, prefix_prompt_task_ids.data()}});
     }
-    if (top_k == 0 && top_p == 0.0f) {
-        FT_CHECK(beam_width > 1);
-        input_tensors.insert({"beam_search_diversity_rate",
-                              Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
-    }
-    else {
-        input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
-        if (top_p != 0.0f) {
-            input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
-        }
-        if (top_k != 0) {
-            input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
-        }
-    }
+    // if (top_k == 0 && top_p == 0.0f) {
+    //     FT_CHECK(beam_width > 1);
+    //     input_tensors.insert({"beam_search_diversity_rate",
+    //                           Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
+    // }
+    // else {
+    //     input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
+    //     if (top_p != 0.0f) {
+    //         input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
+    //     }
+    //     if (top_k != 0) {
+    //         input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
+    //     }
+    // }
 
     std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
         {"output_ids",
@@ -456,13 +456,13 @@ void llama_example(const INIReader reader)
         size_t seqLCount = request_batch_size * beam_width;
         int*   seqlBuf   = new int[seqLCount];
-        size_t inLCount = request_batch_size * beam_width;
-        int*   inlBuf   = new int[inLCount];
-        cudaD2Hcpy(hBuf, d_output_ids, outCount);
         cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
-        cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
-        printf("seqlBuf: %d\n", seqlBuf[0]);
+        printf("seq len: ");
+        for (int i=0; i
 tmp_vec;
         while (std::getline(lineStream, vals, ',')) {
-            printf("vals: %s\n", vals.c_str());
             tmp_vec.push_back(std::stoi(vals));
             i1++;
         }
@@ -89,7 +88,7 @@
             for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) {
                 v_start_ids->push_back(tmp_start_ids[i][j]);
             }
-            printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
+            // printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }
diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu
index 040c1bcff..7e7b13b4c 100644
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -527,6 +527,8 @@ __global__ void gatherTree(gatherTreeParam param)
             param.max_sequence_lengths[batch * param.beam_width + j] =
                 tmp_len - param.max_prefix_soft_prompt_length
                 - (param.max_input_length - param.max_input_without_prompt_length);
+            printf("a: %d b: %d\n", param.max_sequence_lengths[batch * param.beam_width + j], tmp_len);
+            printf("%d %d\n", param.max_input_length, param.max_input_without_prompt_length);
             // update the response input length
             if (update_response_input_length) {
                 param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;
diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu
index 7dc9af620..913f9ab1a 100644
--- a/src/fastertransformer/kernels/gpt_kernels.cu
+++ b/src/fastertransformer/kernels/gpt_kernels.cu
@@ -568,6 +568,32 @@ void invokeTileGptInputs(int* tiled_input_ids,
                          stream);
 }
 
+__global__ void calculateNewTokenLength(int*       output_lengths,
+                                        const int* input_lengths,
+                                        const int  max_input_length,
+                                        const int  batch_size,
+                                        const int  beam_width)
+{
+    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
+         index += blockDim.x * gridDim.x) {
+        output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
+    }
+}
+
+void invokeCalculateNewTokenLength(int*         output_lengths,
+                                   const int*   input_lengths,
+                                   const int    max_input_length,
+                                   const int    batch_size,
+                                   const int    beam_width,
+                                   cudaStream_t stream) {
+    dim3 grid((int)ceil(batch_size * beam_width * 1.0 / 256));
+    dim3 block(256);
+
+    calculateNewTokenLength<<<grid, block, 0, stream>>>(
+        output_lengths, input_lengths, max_input_length, batch_size, beam_width);
+}
+
+
 void setSeqLimitLen(uint32_t* seq_len_d, Tensor seq_len, int limit_len_offset, int batch_size)
 {
     std::vector<uint32_t> seq_len_h(batch_size);
diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h
index d78224e0a..74532f23b 100644
--- a/src/fastertransformer/kernels/gpt_kernels.h
+++ b/src/fastertransformer/kernels/gpt_kernels.h
@@ -121,6 +121,13 @@ void invokeTileGptInputs(int* tiled_input_ids,
                          const int    max_input_length,
                          cudaStream_t stream);
 
+void invokeCalculateNewTokenLength(int*         output_lengths,
+                                   const int*   input_lengths,
+                                   const int    max_input_length,
+                                   const int    batch_size,
+                                   const int    beam_width,
+                                   cudaStream_t stream);
+
 void invokeFindContextDups(int* shared_contexts,
                            int* batch_to_compact,
                            int* compact_to_batch,
diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index 21df52687..68852dde0 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -225,6 +225,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
     const int tid      = threadIdx.x;
     const int batch_id = blockIdx.x;
     if (skip_decode != nullptr && skip_decode[batch_id]) {
+        printf("skip decode\n");
         return;
     }
 
@@ -245,8 +246,9 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
         s_sum = 0.0f;
     }
     TopK_2<float> partial;
-
+    // printf("end id: %d\n", end_ids[batch_id]);
     if (finished != nullptr && finished[batch_id] == true) {
+        printf("batch id: %d", batch_id);
         ids[batch_id] = end_ids[batch_id];
         return;
     }
@@ -307,6 +309,7 @@
         if (sequence_length != nullptr && finished != nullptr) {
             sequence_length[batch_id] = finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
             finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
+            printf("batch %d: %d %d %d\n", batch_id, finished[batch_id], ids[batch_id], end_ids[batch_id]);
         }
     }
 }
diff --git a/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu b/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
index d0092e279..9f2c7155b 100644
--- a/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
+++ b/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
@@ -244,7 +244,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
             stream_);
         sync_check_cuda_error();
     }
-
+    // printf("TopKSamplingLayer::runSampling\n");
     invokeBatchTopKSampling(
         sampling_workspace_,
         sampling_workspace_size_,
diff --git a/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu b/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu
index 2fa3e6b0f..2f61a12bb 100644
--- a/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu
+++ b/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu
@@ -278,6 +278,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
 
+    printf("TopPSamplingLayer::runSampling\n");
     const int batch_size       = output_tensors->at("output_ids").shape[1];
     const int local_batch_size = input_tensors->at("logits").shape[0];
     const int step             = input_tensors->at("step").getVal<int>();
diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 82630079d..cbe59a91f 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1066,6 +1066,37 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
         }
         dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
+        {
+            int*  buf;
+            bool* finish;
+            int*  id_buf;
+            int   seq_len = batch_size * beam_width;
+            int   st      = seq_len;
+            int   st2     = max_seq_len * batch_size * beam_width;
+            buf    = new int[st];
+            id_buf = new int[st2];
+            finish = new bool[st];
+            cudaMemcpy(buf, sequence_lengths_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+            cudaMemcpy(id_buf, output_ids_buf_, sizeof(int) * st2, cudaMemcpyDeviceToHost);
+            cudaMemcpy(finish, finished_buf_, sizeof(bool) * st, cudaMemcpyDeviceToHost);
+
+            printf("seq_len at step: %d\n", step);
+            for (int i=0; i < seq_len; i++) {
+                printf("%d ", buf[i]);
+            }
+            printf("\n");
+            for (int i=0; i < seq_len; i++) {
+                printf("%d ", finish[i]);
+            }
+            printf("\n");
+            printf("ids: \n");
+            for (int i=0; i < batch_size; i++) {
+                for (int j=0; j
::setOutputTensors(std::unordered_map* o
     const size_t batch_size = output_tensors->at("output_ids").shape[0];
     const size_t beam_width = output_tensors->at("output_ids").shape[1];
     int*         sequence_lengths = output_tensors->at("sequence_length").getPtr<int>();
+    int*         input_lengths    = input_tensors->at("input_lengths").getPtr<int>();
     const size_t max_prefix_soft_prompt_length =
         has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0;
@@ -1253,6 +1285,14 @@ void Llama<T>::setOutputTensors(std::unordered_map<std::string, Tensor>* o
                            "The shape of cum_log_probs does not match with batch_size x beam_width.");
         cudaAutoCpy(cum_log_probs.getPtr<float>(), cum_log_probs_, cum_log_probs.size(), stream_);
     }
+    printf("max_input_length: %d\n", max_input_length);
+    invokeCalculateNewTokenLength(sequence_lengths,
+                                  input_lengths,
+                                  max_input_length,
+                                  batch_size,
+                                  beam_width,
+                                  stream_);
+    sync_check_cuda_error();
 }
 
 template<typename T>
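A note on the new length-adjustment kernel: calculateNewTokenLength in gpt_kernels.cu rewrites each (batch, beam) entry of the sequence-length buffer by subtracting the padding that was added to that request's prompt, i.e. max_input_length - input_lengths[batch], and setOutputTensors in Llama.cc now calls it on the returned sequence_length tensor. The host-side sketch below reproduces the same arithmetic for illustration only; the function and variable names are made up for the example and are not part of FasterTransformer:

    #include <cstdio>
    #include <vector>

    // Host-side reference of what the calculateNewTokenLength CUDA kernel does:
    // every output_lengths[batch * beam_width + beam] entry is reduced by the
    // padding added to that batch's prompt (max_input_length - input_lengths[batch]).
    static void calculate_new_token_length_ref(std::vector<int>&       output_lengths,
                                               const std::vector<int>& input_lengths,
                                               int                     max_input_length,
                                               int                     batch_size,
                                               int                     beam_width)
    {
        for (int index = 0; index < batch_size * beam_width; ++index) {
            output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
        }
    }

    int main()
    {
        // Two requests, beam_width 1: prompts of length 5 and 3, both padded to 8.
        // The decoder reports lengths measured against the padded prompt.
        std::vector<int> input_lengths  = {5, 3};
        std::vector<int> output_lengths = {8 + 50, 8 + 47};
        calculate_new_token_length_ref(output_lengths, input_lengths,
                                       /*max_input_length=*/8, /*batch_size=*/2, /*beam_width=*/1);
        for (int v : output_lengths) {
            std::printf("%d ", v);  // prints "55 50": true prompt length plus generated tokens
        }
        std::printf("\n");
        return 0;
    }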