From b04253c5cec8086d5334c69e261f01f68f61965e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Wed, 11 Oct 2023 22:33:16 -0700
Subject: [PATCH 01/57] commit

---
 .vscode/settings.json                        |  5 ++--
 src/fastertransformer/kernels/gpt_kernels.cu | 24 ++++++++++++++++++++
 src/fastertransformer/kernels/gpt_kernels.h  |  6 +++++
 src/fastertransformer/models/llama/Llama.cc  |  1 +
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6f535da99..82000232b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -67,6 +67,7 @@
         "unordered_set": "cpp",
         "future": "cpp",
         "cfenv": "cpp",
-        "typeindex": "cpp"
+        "typeindex": "cpp",
+        "variant": "cpp"
     }
-}
\ No newline at end of file
+}
diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu
index 7dc9af620..da5cc0bf0 100644
--- a/src/fastertransformer/kernels/gpt_kernels.cu
+++ b/src/fastertransformer/kernels/gpt_kernels.cu
@@ -568,6 +568,30 @@ void invokeTileGptInputs(int* tiled_input_ids,
                          stream);
 }
 
+__global__ void calculateNewTokenLength(int* output_lengths,
+                                        const int* input_lengths,
+                                        const int batch_size,
+                                        const int beam_width)
+{
+    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
+         index += blockDim.x * gridDim.x) {
+        output_lengths[index] -= input_lengths[index / beam_width];
+    }
+}
+
+void invokeCalculateNewTokenLength(int* output_lengths,
+                                   const int* input_lengths,
+                                   const int batch_size,
+                                   const int beam_width,
+                                   cudaStream_t stream) {
+    dim3 grid((int)ceil(batch_size * beam_width * 1.0 / 256));
+    dim3 block(256);
+
+    calculateNewTokenLength<<<grid, block, 0, stream>>>(
+        word_ids, force_bos_ids, batch_size, beam_width, step);
+}
+
+
 void setSeqLimitLen(uint32_t* seq_len_d, Tensor seq_len, int limit_len_offset, int batch_size)
 {
     std::vector<uint32_t> seq_len_h(batch_size);
diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h
index d78224e0a..9831fc687 100644
--- a/src/fastertransformer/kernels/gpt_kernels.h
+++ b/src/fastertransformer/kernels/gpt_kernels.h
@@ -121,6 +121,12 @@ void invokeTileGptInputs(int* tiled_input_ids,
                          const int max_input_length,
                          cudaStream_t stream);
 
+void invokeCalculateNewTokenLength(int* output_lengths,
+                                   const int* input_lengths,
+                                   const int batch_size,
+                                   const int beam_width,
+                                   cudaStream_t stream);
+
 void invokeFindContextDups(int* shared_contexts,
                            int* batch_to_compact,
                            int* compact_to_batch,
diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 82630079d..13319e751 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1176,6 +1176,7 @@ void Llama<T>::setOutputTensors(std::unordered_map<std::string, Tensor>* o
     const size_t batch_size = output_tensors->at("output_ids").shape[0];
     const size_t beam_width = output_tensors->at("output_ids").shape[1];
     int*         sequence_lengths = output_tensors->at("sequence_length").getPtr<int>();
+    int*         input_lengths    = input_tensors->at("input_lengths").getPtr<int>();
     const size_t max_prefix_soft_prompt_length =
         has_prefix_soft_prompt_ ? input_tensors->at("request_prompt_embedding").shape[1] : 0;

From 72199b3cd20dfe435efb04b554190dbbb054df35 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Wed, 11 Oct 2023 22:33:42 -0700
Subject: [PATCH 02/57] commit

---
 src/fastertransformer/kernels/gpt_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu
index da5cc0bf0..b4982220e 100644
--- a/src/fastertransformer/kernels/gpt_kernels.cu
+++ b/src/fastertransformer/kernels/gpt_kernels.cu
@@ -588,7 +588,7 @@ void invokeCalculateNewTokenLength(int* output_lengths,
     dim3 block(256);
 
     calculateNewTokenLength<<<grid, block, 0, stream>>>(
-        word_ids, force_bos_ids, batch_size, beam_width, step);
+        output_lengths, input_lengths, batch_size, beam_width);
 }
 
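The kernel introduced in PATCH 01 (and given the right arguments in PATCH 02 — the launch in PATCH 01 still passed the word_ids/force_bos_ids arguments it was copied from) is a standard grid-stride loop: each of the batch_size * beam_width slots subtracts its request's input length, and the launch rounds the slot count up to whole 256-thread blocks. A minimal self-contained sketch of the same pattern follows; the harness, example numbers, and the name subtractInputLength are illustrative, not part of the patch.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Grid-stride loop: correct for any batch_size * beam_width,
    // even when the grid is smaller than the element count.
    __global__ void subtractInputLength(int* output_lengths, const int* input_lengths,
                                        int batch_size, int beam_width)
    {
        for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
             index += blockDim.x * gridDim.x) {
            // beams of the same request share one input length
            output_lengths[index] -= input_lengths[index / beam_width];
        }
    }

    int main()
    {
        const int batch_size = 2, beam_width = 2, n = batch_size * beam_width;
        int h_out[n] = {47, 47, 40, 40};  // sequence lengths after decoding
        int h_in[batch_size] = {15, 8};   // per-request input lengths
        int *d_out, *d_in;
        cudaMalloc(&d_out, n * sizeof(int));
        cudaMalloc(&d_in, batch_size * sizeof(int));
        cudaMemcpy(d_out, h_out, n * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_in, h_in, batch_size * sizeof(int), cudaMemcpyHostToDevice);

        dim3 block(256);
        dim3 grid((n + block.x - 1) / block.x);  // ceil-div, same effect as ceil(n / 256.0)
        subtractInputLength<<<grid, block>>>(d_out, d_in, batch_size, beam_width);

        cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
        for (int i = 0; i < n; i++) {
            printf("%d ", h_out[i]);  // 32 32 32 32
        }
        printf("\n");
        cudaFree(d_out);
        cudaFree(d_in);
        return 0;
    }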
From 3446b201b3124cbd9f0a27b14afded8cb2f75d97 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Wed, 11 Oct 2023 22:38:31 -0700
Subject: [PATCH 03/57] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index fc24fd26f..5980e75ba 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -45,7 +45,6 @@ int read_start_ids(size_t batch_size,
             int i1 = 0;
             std::vector<int> tmp_vec;
             while (std::getline(lineStream, vals, ',')) {
-                printf("vals: %s\n", vals.c_str());
                 tmp_vec.push_back(std::stoi(vals));
                 i1++;
             }

From fed0fd97972fc22e14ccae84ec596d76410b3d5a Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Wed, 11 Oct 2023 22:56:16 -0700
Subject: [PATCH 04/57] commit

---
 examples/cpp/llama/llama_example.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index ce761f75f..252835630 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -456,14 +456,9 @@ void llama_example(const INIReader reader)
     size_t seqLCount = request_batch_size * beam_width;
     int*   seqlBuf   = new int[seqLCount];
 
-    size_t inLCount = request_batch_size * beam_width;
-    int*   inlBuf   = new int[inLCount];
-
     cudaD2Hcpy(hBuf, d_output_ids, outCount);
     cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
-    cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
     printf("seqlBuf: %d\n", seqlBuf[0]);
-
     {
         std::cout << "Writing " << outCount << " elements\n";
         int zeroCount = 0;
@@ -485,6 +480,9 @@ void llama_example(const INIReader reader)
         }
         std::cout << std::endl << "zeroCount = " << zeroCount << std::endl;
     }
+    for (int i=0; i

Date: Wed, 11 Oct 2023 23:06:16 -0700
Subject: [PATCH 05/57] commit

---
 src/fastertransformer/kernels/decoding_kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu
index 040c1bcff..88e8151f2 100644
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -39,6 +39,7 @@ __global__ void decodingInitialize(bool* finished,
          index += blockDim.x * gridDim.x) {
         finished[index] = false;
         sequence_length[index] = max_input_length;
+        printf("index %d length: %d\n", index, max_input_length);
         if (word_ids != nullptr) {
             word_ids[index] = sentence_ids[index / beam_width];
         }

From 28f736fe33c2e7c929cd01a14b558422f9df6fb3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:44:06 -0700
Subject: [PATCH 06/57] commit

---
 examples/cpp/llama/llama_example.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 252835630..616dca009 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -93,6 +93,7 @@ void llama_example(const INIReader reader)
     const float layernorm_eps = reader.GetFloat(model_name, "layernorm_eps");
     const int   start_id      = reader.GetInteger(model_name, "start_id");
     const int   end_id        = reader.GetInteger(model_name, "end_id");
+    printf("end_id: %d\n", end_id);
 
     const size_t hidden_units = head_num * size_per_head;
     const size_t inter_size   = reader.GetInteger(model_name, "inter_size");

From 59d547cd2836e4d442d8ee88a0e4dbf169c5ef65 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:49:39 -0700
Subject: [PATCH 07/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 13319e751..3c5257591 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1066,6 +1066,18 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
             }
 
             dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
+            {
+                int* buf;
+                int seq_len = 8;
+                int st = seq_len;
+                buf = new int[st];
+                cudaMemcpy(buf, sequence_lengths_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+                printf("seq_len at step: %d\n", step);
+                for (int i=0; i < seq_len; i++) {
+                    printf("%d ", buf[i])
+                }
+                printf("\n");
+            }
             *generation_should_stop_ &= subbatch_should_stop;
         }
     }

From 7f411bebba71e0c89e4572abef94658eb8f1b0a6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:50:02 -0700
Subject: [PATCH 08/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 3c5257591..28f3c8865 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1074,7 +1074,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                 cudaMemcpy(buf, sequence_lengths_, sizeof(T) * st, cudaMemcpyDeviceToHost);
                 printf("seq_len at step: %d\n", step);
                 for (int i=0; i < seq_len; i++) {
-                    printf("%d ", buf[i])
+                    printf("%d ", buf[i]);
                 }
                 printf("\n");
             }

From 58f90cf42cfe572f4128dee3ab339534b5fbd4e4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:51:50 -0700
Subject: [PATCH 09/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 28f3c8865..658ace570 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1071,7 +1071,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                 int seq_len = 8;
                 int st = seq_len;
                 buf = new int[st];
-                cudaMemcpy(buf, sequence_lengths_, sizeof(T) * st, cudaMemcpyDeviceToHost);
+                cudaMemcpy(buf, sequence_lengths_, sizeof(int) * st, cudaMemcpyDeviceToHost);
                 printf("seq_len at step: %d\n", step);
                 for (int i=0; i < seq_len; i++) {
                     printf("%d ", buf[i]);

From 3c6db0238aa9435fc03907396e98461b8c1ef7f4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:53:38 -0700
Subject: [PATCH 10/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 658ace570..20a3be67b 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1068,14 +1068,21 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
             dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
             {
                 int* buf;
-                int seq_len = 8;
+                bool* finish;
+                int seq_len = batch_size * beam_width;
                 int st = seq_len;
                 buf = new int[st];
+                finish = new bool[st];
                 cudaMemcpy(buf, sequence_lengths_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+                cudaMemcpy(finish, finished_buf_, sizeof(bool) * st, cudaMemcpyDeviceToHost);
+
                 printf("seq_len at step: %d\n", step);
                 for (int i=0; i < seq_len; i++) {
                     printf("%d ", buf[i]);
                 }
+                for (int i=0; i < seq_len; i++) {
+                    printf("%d ", finish[i]);
+                }
                 printf("\n");
             }

From 8c13f1d29332c7f914a8dd4601f50ffb611adcc3 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:55:02 -0700
Subject: [PATCH 11/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 20a3be67b..fd2c67757 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1080,6 +1080,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                 for (int i=0; i < seq_len; i++) {
                     printf("%d ", buf[i]);
                 }
+                printf("\n");
                 for (int i=0; i < seq_len; i++) {
                     printf("%d ", finish[i]);
                 }

From 9dc0329864e971dda7ed68cfbb80387c2dbdde02 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:59:01 -0700
Subject: [PATCH 12/57] commit

---
 examples/cpp/llama/llama_example.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 616dca009..469fbcd3d 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -373,7 +373,8 @@ void llama_example(const INIReader reader)
         {"len_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &len_penalty}},
         {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{1}, &min_length}},
         {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, start_ids.data()}},
-        {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}}};
+        {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}},
+    };
 
     if (repetition_penalty != 1.0f) {
         input_tensors.insert(

From 8f0b203e312b5810e051856439cb76aee50a9b9e Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 09:59:10 -0700
Subject: [PATCH 13/57] commit

---
 examples/cpp/llama/llama_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 469fbcd3d..28940c607 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -372,8 +372,8 @@ void llama_example(const INIReader reader)
         {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &temperature}},
         {"len_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &len_penalty}},
         {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{1}, &min_length}},
-        {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, start_ids.data()}},
-        {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}},
+        // {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, start_ids.data()}},
+        // {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}},
     };
 
     if (repetition_penalty != 1.0f) {

From 441af5ea2a59dbfa71b9f9f969bb5f0f807f8537 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:02:36 -0700
Subject: [PATCH 14/57] commit

---
 .../layers/sampling_layers/TopKSamplingLayer.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu b/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
index d0092e279..32f7d6317 100644
--- a/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
+++ b/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
@@ -244,7 +244,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
                               stream_);
         sync_check_cuda_error();
     }
-
+    printf("TopKSamplingLayer::runSampling\n");
     invokeBatchTopKSampling(
         sampling_workspace_,
         sampling_workspace_size_,

From 31383b4c6eb38723ed980a6b02cf5694ece9cca5 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:03:19 -0700
Subject: [PATCH 15/57] commit

---
 .../layers/sampling_layers/TopPSamplingLayer.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu b/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu
index 2fa3e6b0f..2f61a12bb 100644
--- a/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu
+++ b/src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.cu
@@ -278,6 +278,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
 
+    printf("TopPSamplingLayer::runSampling\n");
     const int batch_size       = output_tensors->at("output_ids").shape[1];
     const int local_batch_size = input_tensors->at("logits").shape[0];
     const int step             = input_tensors->at("step").getVal<int>();

From b114cb8086bfbd9049ace4bba3b1c0313d4ab037 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:04:34 -0700
Subject: [PATCH 16/57] commit

---
 examples/cpp/llama/llama_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 28940c607..469fbcd3d 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -372,8 +372,8 @@ void llama_example(const INIReader reader)
         {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &temperature}},
         {"len_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &len_penalty}},
         {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{1}, &min_length}},
-        // {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, start_ids.data()}},
-        // {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}},
+        {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, start_ids.data()}},
+        {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}},
     };
 
     if (repetition_penalty != 1.0f) {
From 2bf928e81e274be95bb4d5a6a4c35c05ea7dcf6c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:08:25 -0700
Subject: [PATCH 17/57] commit

---
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index 21df52687..90f33db70 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -307,6 +307,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
         if (sequence_length != nullptr && finished != nullptr) {
             sequence_length[batch_id] =
                 finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
             finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
+            printf("%d %d %d\n", finished[batch_id], ids[batch_id], end_ids[batch_id]);
         }
     }
 }
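The triple printed by PATCH 17 traces the per-slot epilogue of topk_stage2_sampling: the sequence length advances only while the slot is unfinished, and the slot becomes finished on the step whose sampled id equals its end id. A host-side sketch of that update rule (the token and end-id values are illustrative; the kernel applies this once per batch slot):

    #include <cstdio>

    // One decoding step for a single slot, mirroring the kernel epilogue:
    // advance the length only while unfinished, then mark the slot
    // finished if the freshly sampled id is the end id.
    void stepEpilogue(int sampled_id, int end_id, int& sequence_length, bool& finished)
    {
        sequence_length = finished ? sequence_length : sequence_length + 1;
        finished = (sampled_id == end_id);
    }

    int main()
    {
        int seq_len = 15;      // starts at max_input_length after decodingInitialize
        bool finished = false;
        const int end_id = 2;  // assumed end-of-sequence token id

        int steps[] = {18637, 29892, 2, 2};  // two real tokens, then end_id twice
        for (int id : steps) {
            stepEpilogue(id, end_id, seq_len, finished);
            printf("id=%5d len=%d finished=%d\n", id, seq_len, finished);
        }
        // len stops at 18: the step that samples end_id still counts,
        // later end_id steps do not advance it.
        return 0;
    }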
From 47a55dc7089a0d259bf1bcb024516c49c6b4651d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:12:27 -0700
Subject: [PATCH 18/57] commit

---
 examples/cpp/llama/llama_example.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 469fbcd3d..eff1fc363 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -214,6 +214,10 @@ void llama_example(const INIReader reader)
         cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len);
         cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size);
     }
+    for (int i=0; i< v_start_lengths.size(); i++) {
+        printf("%d ", v_start_lengths[i]);
+    }
+    printf("\n");
 
     std::vector<int> start_ids(request_batch_size, start_id);
     std::vector<int> end_ids(request_batch_size, end_id);

From d450e4f7a923b93456c35f9e97a6da09fad4d417 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:35:36 -0700
Subject: [PATCH 19/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index fd2c67757..327fb412e 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1274,6 +1274,12 @@ void Llama<T>::setOutputTensors(std::unordered_map<std::string, Tensor>* o
                         "The shape of cum_log_probs does not match with batch_size x beam_width.");
         cudaAutoCpy(cum_log_probs.getPtr<float>(), cum_log_probs_, cum_log_probs.size(), stream_);
     }
+    invokeCalculateNewTokenLength(sequence_lengths,
+                                  input_lengths,
+                                  batch_size,
+                                  beam_width,
+                                  stream_);
+    sync_check_cuda_error();
 }

From f2ae7bdcb3362bcdb0744b2da7995432ad56aa80 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:36:59 -0700
Subject: [PATCH 20/57] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index ef789d35d..1292bb707 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -9,7 +9,7 @@ model_name=llama_7b
 model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
 
 [request]
-beam_width=1 # beam width for beam search
+beam_width=2 # beam width for beam search
 top_k=1 ; k value for top k sampling
 top_p=0.0 ; p value for top p sampling
 temperature=1.0 ; Use for sampling

From 4a780cba53d7defe98f0636bcf9ed1194469c418 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:39:20 -0700
Subject: [PATCH 21/57] commit

---
 .vscode/settings.json               | 17 ++++++++++++++++-
 examples/cpp/llama/llama_example.cc |  3 +++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 82000232b..354d015db 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -68,6 +68,21 @@
         "future": "cpp",
         "cfenv": "cpp",
         "typeindex": "cpp",
-        "variant": "cpp"
+        "variant": "cpp",
+        "__bit_reference": "cpp",
+        "__config": "cpp",
+        "__debug": "cpp",
+        "__errc": "cpp",
+        "__hash_table": "cpp",
+        "__locale": "cpp",
+        "__mutex_base": "cpp",
+        "__node_handle": "cpp",
+        "__split_buffer": "cpp",
+        "__threading_support": "cpp",
+        "__tree": "cpp",
+        "__verbose_abort": "cpp",
+        "charconv": "cpp",
+        "ios": "cpp",
+        "locale": "cpp"
     }
 }
diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index eff1fc363..91f79967a 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -362,9 +362,12 @@ void llama_example(const INIReader reader)
     deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false);
 
     std::vector<uint32_t> output_seq_len(request_batch_size, total_output_len);
+    int beam_width = 2;
     std::unordered_map<std::string, Tensor> input_tensors = std::unordered_map<std::string, Tensor>{
         {"input_ids",
          Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size, (size_t)max_input_len}, d_input_ids}},
+        {"beam_width",
+         triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, &beam_width}},
         {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, d_input_lengths}},
         // NOTE: if you need prefix prompts, remember to add prefix_prompt_task_ids here
         // {"prompt_learning_task_name_ids", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size},

From c163de820b0cf89c03b3306a24c4cf8113080fea Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:39:49 -0700
Subject: [PATCH 22/57] commit

---
 examples/cpp/llama/llama_example.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 91f79967a..6f8c2f21e 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -362,12 +362,12 @@ void llama_example(const INIReader reader)
     std::vector<uint32_t> output_seq_len(request_batch_size, total_output_len);
-    int beam_width = 2;
+    int beam_width2 = 2;
     std::unordered_map<std::string, Tensor> input_tensors = std::unordered_map<std::string, Tensor>{
         {"input_ids",
          Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size, (size_t)max_input_len}, d_input_ids}},
         {"beam_width",
-         triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, &beam_width}},
+         triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, &beam_width2}},
         {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, d_input_lengths}},

From d97d69cca1d09bc2c3c113f2ddffd62a3ced1239 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:40:10 -0700
Subject: [PATCH 23/57] commit

---
 examples/cpp/llama/llama_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 6f8c2f21e..1d8320f19 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -367,7 +367,7 @@ void llama_example(const INIReader reader)
         {"input_ids",
          Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size, (size_t)max_input_len}, d_input_ids}},
         {"beam_width",
-         triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector<size_t>{1}, &beam_width2}},
+         Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{1}, &beam_width2}},
         {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, d_input_lengths}},

From 48b72a9360d9a79f908443ee205e9eefc0fdc132 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 10:42:01 -0700
Subject: [PATCH 24/57] commit

---
 examples/cpp/llama/llama_example.cc | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 1d8320f19..ce761f75f 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -93,7 +93,6 @@ void llama_example(const INIReader reader)
     const float layernorm_eps = reader.GetFloat(model_name, "layernorm_eps");
     const int   start_id      = reader.GetInteger(model_name, "start_id");
     const int   end_id        = reader.GetInteger(model_name, "end_id");
-    printf("end_id: %d\n", end_id);
 
     const size_t hidden_units = head_num * size_per_head;
     const size_t inter_size   = reader.GetInteger(model_name, "inter_size");
@@ -214,10 +213,6 @@ void llama_example(const INIReader reader)
         cudaH2Dcpy(d_input_ids, v_start_ids.data(), request_batch_size * max_input_len);
         cudaH2Dcpy(d_input_lengths, v_start_lengths.data(), request_batch_size);
     }
-    for (int i=0; i< v_start_lengths.size(); i++) {
-        printf("%d ", v_start_lengths[i]);
-    }
-    printf("\n");
 
     std::vector<int> start_ids(request_batch_size, start_id);
     std::vector<int> end_ids(request_batch_size, end_id);
@@ -362,12 +357,9 @@ void llama_example(const INIReader reader)
     deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false);
 
     std::vector<uint32_t> output_seq_len(request_batch_size, total_output_len);
-    int beam_width2 = 2;
     std::unordered_map<std::string, Tensor> input_tensors = std::unordered_map<std::string, Tensor>{
         {"input_ids",
          Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size, (size_t)max_input_len}, d_input_ids}},
-        {"beam_width",
-         Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{1}, &beam_width2}},
         {"input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, d_input_lengths}},
         // NOTE: if you need prefix prompts, remember to add prefix_prompt_task_ids here
         // {"prompt_learning_task_name_ids", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size},
@@ -380,8 +372,7 @@ void llama_example(const INIReader reader)
         {"temperature", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &temperature}},
         {"len_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &len_penalty}},
         {"min_length", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{1}, &min_length}},
         {"start_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, start_ids.data()}},
-        {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}},
-    };
+        {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, end_ids.data()}}};
 
     if (repetition_penalty != 1.0f) {
         input_tensors.insert(
@@ -465,9 +456,14 @@ void llama_example(const INIReader reader)
     size_t seqLCount = request_batch_size * beam_width;
     int*   seqlBuf   = new int[seqLCount];
 
+    size_t inLCount = request_batch_size * beam_width;
+    int*   inlBuf   = new int[inLCount];
+
     cudaD2Hcpy(hBuf, d_output_ids, outCount);
     cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
+    cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
     printf("seqlBuf: %d\n", seqlBuf[0]);
+
     {
         std::cout << "Writing " << outCount << " elements\n";
         int zeroCount = 0;
@@ -489,9 +485,6 @@ void llama_example(const INIReader reader)
         }
         std::cout << std::endl << "zeroCount = " << zeroCount << std::endl;
     }
-    for (int i=0; i

Date: Thu, 12 Oct 2023 10:43:45 -0700
Subject: [PATCH 25/57] commit

---
 examples/cpp/llama/llama_example.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index ce761f75f..a2cdbfd80 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -456,13 +456,13 @@ void llama_example(const INIReader reader)
     size_t seqLCount = request_batch_size * beam_width;
     int*   seqlBuf   = new int[seqLCount];
 
     cudaD2Hcpy(hBuf, d_output_ids, outCount);
     cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
-    cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
-    printf("seqlBuf: %d\n", seqlBuf[0]);
+    printf("seq len: ");
+    for (int i=0; i

Date: Thu, 12 Oct 2023 20:14:14 -0700
Subject: [PATCH 26/57] commit

---
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index 90f33db70..027030c66 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -247,6 +247,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
     TopK_2<float> partial;
 
     if (finished != nullptr && finished[batch_id] == true) {
+        printf("batch id: %d", batch_id);
        ids[batch_id] = end_ids[batch_id];
         return;
     }

From b39167aa95df25d3bd8680cce29d8f735bc75c36 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:16:05 -0700
Subject: [PATCH 27/57] commit

---
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index 027030c66..f55d2a913 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -245,7 +245,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
         s_sum = 0.0f;
     }
     TopK_2<float> partial;
-
+    printf("end id: %d\n", end_ids[batch_id]);
     if (finished != nullptr && finished[batch_id] == true) {
         printf("batch id: %d", batch_id);
         ids[batch_id] = end_ids[batch_id];

From 7e7101fbbb5592f6144578a213e6ad988c74b86d Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:20:59 -0700
Subject: [PATCH 28/57] commit

---
 .../kernels/sampling_topk_kernels.cu        |  2 +-
 src/fastertransformer/models/llama/Llama.cc | 23 +++--------------------
 2 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index f55d2a913..2dbced267 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -245,7 +245,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
         s_sum = 0.0f;
     }
     TopK_2<float> partial;
-    printf("end id: %d\n", end_ids[batch_id]);
+    // printf("end id: %d\n", end_ids[batch_id]);
     if (finished != nullptr && finished[batch_id] == true) {
         printf("batch id: %d", batch_id);
         ids[batch_id] = end_ids[batch_id];
diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 327fb412e..b0981b880 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1066,26 +1066,6 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
             }
 
             dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
-            {
-                int* buf;
-                bool* finish;
-                int seq_len = batch_size * beam_width;
-                int st = seq_len;
-                buf = new int[st];
-                finish = new bool[st];
-                cudaMemcpy(buf, sequence_lengths_, sizeof(int) * st, cudaMemcpyDeviceToHost);
-                cudaMemcpy(finish, finished_buf_, sizeof(bool) * st, cudaMemcpyDeviceToHost);
-
-                printf("seq_len at step: %d\n", step);
-                for (int i=0; i < seq_len; i++) {
-                    printf("%d ", buf[i]);
-                }
-                printf("\n");
-                for (int i=0; i < seq_len; i++) {
-                    printf("%d ", finish[i]);
-                }
-                printf("\n");
-            }
             *generation_should_stop_ &= subbatch_should_stop;
         }
     }
@@ -1141,6 +1121,9 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                                               beam_width,
                                               stream_);
             }
+            {
+
+            }
         }
 
         setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len);

From b8dac84656ace3d784b6fced470d7dc89610f7bc Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:23:09 -0700
Subject: [PATCH 29/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 23 ++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index b0981b880..ce65fedd6 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1066,6 +1066,26 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
             dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
+            {
+                int* buf;
+                bool* finish;
+                int seq_len = batch_size * beam_width;
+                int st = seq_len;
+                buf = new int[st];
+                finish = new bool[st];
+                cudaMemcpy(buf, sequence_lengths_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+                cudaMemcpy(finish, finished_buf_, sizeof(bool) * st, cudaMemcpyDeviceToHost);
+
+                printf("seq_len at step: %d\n", step);
+                for (int i=0; i < seq_len; i++) {
+                    printf("%d ", buf[i]);
+                }
+                printf("\n");
+                for (int i=0; i < seq_len; i++) {
+                    printf("%d ", finish[i]);
+                }
+                printf("\n");
+            }
             *generation_should_stop_ &= subbatch_should_stop;
         }
     }
@@ -1141,9 +1161,6 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                                               beam_width,
                                               stream_);
             }
-            {
-
-            }
         }
 
         setOutputTensors(output_tensors, input_tensors, max_input_length, max_output_seq_len);

From cb74cfc434695aa315bd6e935d1e131e8dc92620 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:27:17 -0700
Subject: [PATCH 30/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index ce65fedd6..d8dd6ba7f 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1069,11 +1069,15 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
             {
                 int* buf;
                 bool* finish;
+                int* id_buf;
                 int seq_len = batch_size * beam_width;
                 int st = seq_len;
+                int st2 = max_seq_len * batch_size * beam_width;
                 buf = new int[st];
+                id_buf = new int[st2];
                 finish = new bool[st];
                 cudaMemcpy(buf, sequence_lengths_, sizeof(int) * st, cudaMemcpyDeviceToHost);
+                cudaMemcpy(id_buf, output_ids_buf_, sizeof(int) * st2, cudaMemcpyDeviceToHost);
                 cudaMemcpy(finish, finished_buf_, sizeof(bool) * st, cudaMemcpyDeviceToHost);
 
                 printf("seq_len at step: %d\n", step);
@@ -1085,6 +1089,13 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                     printf("%d ", finish[i]);
                 }
                 printf("\n");
+                printf("ids: \n");
+                for (int i=0; i < batch_size; i++) {
+                    for (int j=0; j
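The dump blocks re-added and extended in patches 28 through 30 issue blocking cudaMemcpy calls against buffers the decode step writes on stream_, and the new[] allocations are never freed. A leak-free variant of the same diagnostic is sketched below; the buffer and member names follow the surrounding Llama code, and the explicit stream synchronize is an assumption about the intended ordering rather than something the patches themselves do.

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    // Dump sequence lengths and finished flags for every batch*beam slot.
    // std::vector frees itself, and the synchronize makes the copies
    // observe everything previously enqueued on `stream`.
    void dumpDecodeState(const int* d_sequence_lengths, const bool* d_finished,
                         int batch_size, int beam_width, int step, cudaStream_t stream)
    {
        const size_t n = (size_t)batch_size * beam_width;
        std::vector<int>  lengths(n);
        std::vector<char> finished(n);  // vector<bool> has no contiguous data()

        cudaStreamSynchronize(stream);
        cudaMemcpy(lengths.data(), d_sequence_lengths, n * sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(finished.data(), d_finished, n * sizeof(bool), cudaMemcpyDeviceToHost);

        printf("seq_len at step: %d\n", step);
        for (size_t i = 0; i < n; i++) {
            printf("%d ", lengths[i]);
        }
        printf("\n");
        for (size_t i = 0; i < n; i++) {
            printf("%d ", (int)finished[i]);
        }
        printf("\n");
    }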
Date: Thu, 12 Oct 2023 20:28:09 -0700
Subject: [PATCH 31/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index d8dd6ba7f..fd4b19078 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1092,7 +1092,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                 printf("ids: \n");
                 for (int i=0; i < batch_size; i++) {
                     for (int j=0; j

Date: Thu, 12 Oct 2023 20:30:18 -0700
Subject: [PATCH 32/57] commit

---
 examples/cpp/llama/llama_example.cc                    | 2 +-
 examples/cpp/llama/start_ids.csv                       | 1 +
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index a2cdbfd80..ed81e74e6 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -196,7 +196,7 @@ void llama_example(const INIReader reader)
                       max_input_len,
                       end_id,
                       1,
-                      "/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
+                      "/notebooks/tmp/FasterTransformer/examples/cpp/llama/start_ids.csv");
 
     int* d_input_ids;
diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv
index 612c85964..c072892c6 100644
--- a/examples/cpp/llama/start_ids.csv
+++ b/examples/cpp/llama/start_ids.csv
@@ -1 +1,2 @@
 1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
+1, 18637
diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index 2dbced267..cba3c9308 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -308,7 +308,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
         if (sequence_length != nullptr && finished != nullptr) {
             sequence_length[batch_id] =
                 finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
             finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
-            printf("%d %d %d\n", finished[batch_id], ids[batch_id], end_ids[batch_id]);
+            // printf("%d %d %d\n", finished[batch_id], ids[batch_id], end_ids[batch_id]);
         }
     }
 }

From 27b843a276d73530a4e896e8352d26d6f5b4c850 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:30:43 -0700
Subject: [PATCH 33/57] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index 1292bb707..2ce32e5de 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -17,7 +17,7 @@
 repetition_penalty=1.0 ; Use for sampling
 presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
 len_penalty=0.0
 beam_search_diversity_rate=0.0
-request_batch_size=8 # determine by the request
+request_batch_size=2 # determine by the request
 request_output_len=32 # determine by the request
 
 [llama_7b]

From 4cd3026c57af9160bdbad830573fe206e71db645 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:32:11 -0700
Subject: [PATCH 34/57] commit

---
 examples/cpp/multi_gpu_gpt/gpt_example_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
index 5980e75ba..6f23d6a5b 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
+++ b/examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -88,7 +88,7 @@ int read_start_ids(size_t batch_size,
             for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) {
                 v_start_ids->push_back(tmp_start_ids[i][j]);
             }
-            printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
+            // printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
             v_start_lengths->push_back(tmp_start_lengths[i]);
         }
     }

From e3f14f3126e90516cbc3b2677e46e66e373ceba4 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:33:43 -0700
Subject: [PATCH 35/57] commit

---
 examples/cpp/llama/llama_example.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index ed81e74e6..7f3007a98 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -44,7 +44,7 @@ int main(int argc, char* argv[])
         ini_name = std::string(argv[1]);
     }
     else {
-        ini_name = "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
+        ini_name = "/notebooks/tmp/FasterTransformer/examples/cpp/llama/llama_config.ini";
     }
 
     INIReader reader = INIReader(ini_name);

From 882a4300865e402bc9c2b7482b435b990789a8ec Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:34:44 -0700
Subject: [PATCH 36/57] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index 2ce32e5de..ef5364181 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -9,7 +9,7 @@
 model_name=llama_7b
 model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
 
 [request]
-beam_width=2 # beam width for beam search
+beam_width=1 # beam width for beam search
 top_k=1 ; k value for top k sampling
 top_p=0.0 ; p value for top p sampling
 temperature=1.0 ; Use for sampling

From 5c561c154378d05ba85c44eeadf301cf3371f934 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:35:17 -0700
Subject: [PATCH 37/57] commit

---
 examples/cpp/llama/llama_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini
index ef5364181..4e26ea5a9 100644
--- a/examples/cpp/llama/llama_config.ini
+++ b/examples/cpp/llama/llama_config.ini
@@ -10,7 +10,7 @@
 [request]
 beam_width=1 # beam width for beam search
-top_k=1 ; k value for top k sampling
+top_k=0 ; k value for top k sampling
 top_p=0.0 ; p value for top p sampling
 temperature=1.0 ; Use for sampling
 repetition_penalty=1.0 ; Use for sampling

From bee965ea8d4daa9227b9baf5a989a7f8a3eaec40 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:36:06 -0700
Subject: [PATCH 38/57] commit

---
 examples/cpp/llama/llama_example.cc | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc
index 7f3007a98..a23eee7fa 100644
--- a/examples/cpp/llama/llama_example.cc
+++ b/examples/cpp/llama/llama_example.cc
@@ -390,20 +390,20 @@ void llama_example(const INIReader reader)
                          Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, prefix_prompt_task_ids.data()}});
     }
 
-    if (top_k == 0 && top_p == 0.0f) {
-        FT_CHECK(beam_width > 1);
-        input_tensors.insert({"beam_search_diversity_rate",
-                              Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
-    }
-    else {
-        input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
-        if (top_p != 0.0f) {
-            input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
-        }
-        if (top_k != 0) {
-            input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
-        }
-    }
+    // if (top_k == 0 && top_p == 0.0f) {
+    //     FT_CHECK(beam_width > 1);
+    //     input_tensors.insert({"beam_search_diversity_rate",
+    //                           Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
+    // }
+    // else {
+    //     input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
+    //     if (top_p != 0.0f) {
+    //         input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
+    //     }
+    //     if (top_k != 0) {
+    //         input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
+    //     }
+    // }
 
     std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
         {"output_ids",

From 7d62c7f3c5ae3f5cd995d56575a2727a6d09261b Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:37:51 -0700
Subject: [PATCH 39/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index fd4b19078..91a0d62a3 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1092,7 +1092,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                 printf("ids: \n");
                 for (int i=0; i < batch_size; i++) {
                     for (int j=0; j

Date: Thu, 12 Oct 2023 20:40:26 -0700
Subject: [PATCH 40/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 91a0d62a3..01f78beca 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1092,7 +1092,7 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
                 printf("ids: \n");
                 for (int i=0; i < batch_size; i++) {
                     for (int j=0; j

Date: Thu, 12 Oct 2023 20:45:46 -0700
Subject: [PATCH 41/57] commit

---
 examples/cpp/gpt/gpt_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/gpt/gpt_config.ini b/examples/cpp/gpt/gpt_config.ini
index ebeb066ad..0586f5c3f 100644
--- a/examples/cpp/gpt/gpt_config.ini
+++ b/examples/cpp/gpt/gpt_config.ini
@@ -17,7 +17,7 @@
 model_name=gpt_124M
 ; model_name=gpt_175B
 ; model_name=self_defined
 ; model_dir=./models/megatron-models/c-model/6.7b/
-model_dir=models/openai-gpt-models/c-model/124m/1-gpu/
+model_dir=/notebooks/ft_gpt2/1-gpu/
 shared_contexts_ratio=1.0
 
 [request]
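The debug loops that patches 30, 31, 39, and 40 keep adjusting walk output_ids_buf_, which the dynamic decode layer fills step-major — one contiguous row of batch_size * beam_width ids per decode step — rather than one contiguous run per request. A short sketch of reading token j of slot i under that layout (sizes and values are illustrative):

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int batch_size = 2, beam_width = 1, max_seq_len = 4;
        const int slots = batch_size * beam_width;

        // Step-major storage: token j of slot i lives at [j * slots + i].
        std::vector<int> step_major = {
            11, 21,  // step 0: slot 0, slot 1
            12, 22,  // step 1
            13, 23,  // step 2
            14, 24,  // step 3
        };

        for (int i = 0; i < slots; i++) {
            printf("slot %d: ", i);
            for (int j = 0; j < max_seq_len; j++) {
                // wrong for this layout: step_major[i * max_seq_len + j]
                printf("%d ", step_major[j * slots + i]);
            }
            printf("\n");  // slot 0: 11 12 13 14, slot 1: 21 22 23 24
        }
        return 0;
    }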
From f7631a54ac0c721ca803f7d3b00710ac3865b53c Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:48:40 -0700
Subject: [PATCH 42/57] commit

---
 examples/cpp/gpt/gpt_example.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/cpp/gpt/gpt_example.cc b/examples/cpp/gpt/gpt_example.cc
index 2f4b29799..9b3d3df2a 100644
--- a/examples/cpp/gpt/gpt_example.cc
+++ b/examples/cpp/gpt/gpt_example.cc
@@ -452,10 +452,8 @@ void gpt_example(const INIReader reader)
                 outFile << std::endl;
             }
 
-            if (i < 10) {
-                printf("%5d ", hBuf[i]);
-            }
-            if ((i + 1) % (total_output_len) == 0 && i < 10) {
+            printf("%5d ", hBuf[i]);
+            if ((i + 1) % (total_output_len) == 0) {
                 std::cout << std::endl;
             }
         }

From 092e00e8411532ab67095629015e802bf2ddd355 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:49:32 -0700
Subject: [PATCH 43/57] commit

---
 examples/cpp/gpt/gpt_config.ini | 2 +-
 examples/cpp/gpt/start_ids.csv  | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/examples/cpp/gpt/gpt_config.ini b/examples/cpp/gpt/gpt_config.ini
index 0586f5c3f..00d5745d5 100644
--- a/examples/cpp/gpt/gpt_config.ini
+++ b/examples/cpp/gpt/gpt_config.ini
@@ -21,7 +21,7 @@
 model_dir=/notebooks/ft_gpt2/1-gpu/
 shared_contexts_ratio=1.0
 
 [request]
-request_batch_size=8 ; determine by the request
+request_batch_size=2 ; determine by the request
 request_output_len=32 ; determine by the request
 return_log_probs=false ; return the output log probs and cumulative log probs.
 context_log_probs=false ; include input contexts in the cumulative log probability computation.
diff --git a/examples/cpp/gpt/start_ids.csv b/examples/cpp/gpt/start_ids.csv
index 38c1beb0d..35aa59042 100644
--- a/examples/cpp/gpt/start_ids.csv
+++ b/examples/cpp/gpt/start_ids.csv
@@ -1,8 +1,2 @@
 818, 262, 938, 3155, 286, 1528, 11, 257
-198, 464, 968, 8221, 2732, 286, 15198, 318
-464, 968, 1971, 12056, 423, 257, 649, 1182
-464, 968, 1971, 3782, 468, 3199, 663, 5079
-818, 257, 1445, 326, 481, 1884, 787, 340
-464, 968, 1971, 12056, 6, 5859, 41683, 423
-198, 198, 464, 5398, 4332, 628, 628, 198
-464, 717, 640, 314, 2497, 262, 3807, 11
+198

From 30521510d80a1a1698566e9b61f0fb4290efa574 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:50:01 -0700
Subject: [PATCH 44/57] commit

---
 examples/cpp/gpt/gpt_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/gpt/gpt_config.ini b/examples/cpp/gpt/gpt_config.ini
index 00d5745d5..8602046f7 100644
--- a/examples/cpp/gpt/gpt_config.ini
+++ b/examples/cpp/gpt/gpt_config.ini
@@ -3,7 +3,7 @@
 max_batch_size=8 ; Use for allocate the buffer
 max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter
 beam_width=1 ; beam width for beam search
 top_k=0 ; k value for top k sampling
-top_p=0.5 ; p value for top p sampling
+top_p=0 ; p value for top p sampling
 temperature=1.0 ; Use for sampling
 repetition_penalty=2.0 ; Use for sampling
 presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
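Both start_ids.csv files edited in this series hold one request per line as comma-separated token ids, which read_start_ids splits with getline plus stoi (PATCH 03 trims a debug print out of exactly that loop). A stripped-down sketch of the parsing, with the padding and error handling of the real helper omitted:

    #include <cstdio>
    #include <fstream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Parse "818, 262, 938" style lines into per-request token id vectors,
    // mirroring the core loop of read_start_ids in gpt_example_utils.cc.
    std::vector<std::vector<int>> parseStartIds(const std::string& path)
    {
        std::vector<std::vector<int>> requests;
        std::ifstream file(path);
        std::string   line;
        while (std::getline(file, line)) {
            std::stringstream lineStream(line);
            std::string       vals;
            std::vector<int>  tmp_vec;
            while (std::getline(lineStream, vals, ',')) {
                tmp_vec.push_back(std::stoi(vals));  // stoi tolerates leading spaces
            }
            requests.push_back(tmp_vec);
        }
        return requests;
    }

    int main()
    {
        for (const auto& req : parseStartIds("start_ids.csv")) {
            printf("%zu tokens\n", req.size());
        }
        return 0;
    }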
From d9ce51437b64452a22ad61fe1072bd627996a2c7 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:50:27 -0700
Subject: [PATCH 45/57] commit

---
 examples/cpp/gpt/gpt_example.cc | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/cpp/gpt/gpt_example.cc b/examples/cpp/gpt/gpt_example.cc
index 9b3d3df2a..47dc1592f 100644
--- a/examples/cpp/gpt/gpt_example.cc
+++ b/examples/cpp/gpt/gpt_example.cc
@@ -343,20 +343,20 @@ void gpt_example(const INIReader reader)
         input_tensors.insert(
             {"presence_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &presence_penalty}});
     }
-    if (top_k == 0 && top_p == 0.0f) {
-        FT_CHECK(beam_width > 1);
-        input_tensors.insert({"beam_search_diversity_rate",
-                              Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
-    }
-    else {
-        input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
-        if (top_p != 0.0f) {
-            input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
-        }
-        if (top_k != 0) {
-            input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
-        }
-    }
+    // if (top_k == 0 && top_p == 0.0f) {
+    //     FT_CHECK(beam_width > 1);
+    //     input_tensors.insert({"beam_search_diversity_rate",
+    //                           Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
+    // }
+    // else {
+    //     input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
+    //     if (top_p != 0.0f) {
+    //         input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
+    //     }
+    //     if (top_k != 0) {
+    //         input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
+    //     }
+    // }
 
     std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
         {"output_ids",

From 05371893f162cec442acbdc4531111feca052de2 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:51:08 -0700
Subject: [PATCH 46/57] commit

---
 .../layers/sampling_layers/TopKSamplingLayer.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu b/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
index 32f7d6317..9f2c7155b 100644
--- a/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
+++ b/src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.cu
@@ -244,7 +244,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
                               stream_);
         sync_check_cuda_error();
     }
-    printf("TopKSamplingLayer::runSampling\n");
+    // printf("TopKSamplingLayer::runSampling\n");
     invokeBatchTopKSampling(
         sampling_workspace_,
         sampling_workspace_size_,

From 115a949ff50a2c32d8053cfa7d15389f37c1fac6 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:52:31 -0700
Subject: [PATCH 47/57] commit

---
 src/fastertransformer/kernels/decoding_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu
index 88e8151f2..d62dd49b7 100644
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -39,7 +39,7 @@ __global__ void decodingInitialize(bool* finished,
          index += blockDim.x * gridDim.x) {
         finished[index] = false;
         sequence_length[index] = max_input_length;
-        printf("index %d length: %d\n", index, max_input_length);
+        // printf("index %d length: %d\n", index, max_input_length);
         if (word_ids != nullptr) {
             word_ids[index] = sentence_ids[index / beam_width];
         }

From 762efd13d96e2256b815acaca072c23de6fadf35 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 20:54:01 -0700
Subject: [PATCH 48/57] commit

---
 examples/cpp/gpt/gpt_example.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/cpp/gpt/gpt_example.cc b/examples/cpp/gpt/gpt_example.cc
index 47dc1592f..b1540ea13 100644
--- a/examples/cpp/gpt/gpt_example.cc
+++ b/examples/cpp/gpt/gpt_example.cc
@@ -439,9 +439,16 @@ void gpt_example(const INIReader reader)
         size_t outCount = total_output_len * request_batch_size * beam_width;
         int*   hBuf     = new int[outCount];
         cudaD2Hcpy(hBuf, d_output_ids, outCount);
+        size_t seqLenCount = request_batch_size * beam_width;
+        int* hBuf2 = new int[seqLenCount];
+        cudaD2Hcpy(hBuf2, d_sequence_lengths, seqLenCount);
 
         {
             std::cout << "Writing " << outCount << " elements\n";
+            for (int i=0; i

Date: Thu, 12 Oct 2023 21:02:58 -0700
Subject: [PATCH 49/57] commit

---
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index cba3c9308..2dbced267 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -308,7 +308,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
         if (sequence_length != nullptr && finished != nullptr) {
             sequence_length[batch_id] =
                 finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
             finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
-            // printf("%d %d %d\n", finished[batch_id], ids[batch_id], end_ids[batch_id]);
+            printf("%d %d %d\n", finished[batch_id], ids[batch_id], end_ids[batch_id]);
         }
     }
 }

From dd23f209e7cb8a643a5923bbd5b755cf81189f37 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:03:46 -0700
Subject: [PATCH 50/57] commit

---
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index 2dbced267..fc83631c1 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -225,6 +225,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
     const int tid      = threadIdx.x;
     const int batch_id = blockIdx.x;
     if (skip_decode != nullptr && skip_decode[batch_id]) {
+        printf("skip decode\n");
         return;
     }

From 7d83aff4ca7d9780c5893bb6c8232236bfe6beee Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:04:30 -0700
Subject: [PATCH 51/57] commit

---
 src/fastertransformer/kernels/sampling_topk_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/kernels/sampling_topk_kernels.cu b/src/fastertransformer/kernels/sampling_topk_kernels.cu
index fc83631c1..68852dde0 100644
--- a/src/fastertransformer/kernels/sampling_topk_kernels.cu
+++ b/src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -309,7 +309,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
             sequence_length[batch_id] =
                 finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
             finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
-            printf("%d %d %d\n", finished[batch_id], ids[batch_id], end_ids[batch_id]);
+            printf("batch %d: %d %d %d\n", batch_id, finished[batch_id], ids[batch_id], end_ids[batch_id]);
         }
     }
 }

From a96e57d90e4ad0a1a39e92b56950cd3f35f26320 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:07:22 -0700
Subject: [PATCH 52/57] commit

---
 examples/cpp/multi_gpu_gpt/gpt_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/multi_gpu_gpt/gpt_config.ini b/examples/cpp/multi_gpu_gpt/gpt_config.ini
index 67f8330a8..07741ba28 100644
--- a/examples/cpp/multi_gpu_gpt/gpt_config.ini
+++ b/examples/cpp/multi_gpu_gpt/gpt_config.ini
@@ -32,7 +32,7 @@
 shared_contexts_ratio=1.0
 
 [request]
 request_batch_size=8 ; determine by the request
-request_output_len=32 ; determine by the request
+request_output_len=50 ; determine by the request
 return_log_probs=false ; return the output log probs and cumulative log probs.
 context_log_probs=false ; include input contexts in the cumulative log probability computation.
 remove_padding=true

From 59fb43dabd29eee9b29920ccd077801825308a81 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:07:46 -0700
Subject: [PATCH 53/57] commit

---
 examples/cpp/gpt/gpt_config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cpp/gpt/gpt_config.ini b/examples/cpp/gpt/gpt_config.ini
index 8602046f7..efbee1d44 100644
--- a/examples/cpp/gpt/gpt_config.ini
+++ b/examples/cpp/gpt/gpt_config.ini
@@ -22,7 +22,7 @@
 shared_contexts_ratio=1.0
 
 [request]
 request_batch_size=2 ; determine by the request
-request_output_len=32 ; determine by the request
+request_output_len=50 ; determine by the request
 return_log_probs=false ; return the output log probs and cumulative log probs.
 context_log_probs=false ; include input contexts in the cumulative log probability computation.
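Patches 52 and 53 raise request_output_len from 32 to 50; in these examples the output buffers are sized from total_output_len, which (assuming the usual example code, where total_output_len = max_input_len + request_output_len) grows accordingly. A worked sketch of the buffer arithmetic with the trimmed start_ids.csv above (8-token prompts, batch 2, beam 1; values illustrative):

    #include <cstdio>

    int main()
    {
        const int max_input_len      = 8;   // longest row in start_ids.csv
        const int request_output_len = 50;  // raised from 32 in patches 52/53
        const int request_batch_size = 2;
        const int beam_width         = 1;

        // assumption: the examples size output as prompt length + new tokens
        const int total_output_len = max_input_len + request_output_len;  // 58
        const size_t outCount = (size_t)total_output_len * request_batch_size * beam_width;

        printf("total_output_len = %d, outCount = %zu\n", total_output_len, outCount);  // 58, 116
        return 0;
    }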
From 3311fbad38e05992d452da5c1c931fb5b822a731 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:34:41 -0700
Subject: [PATCH 54/57] commit

---
 src/fastertransformer/models/llama/Llama.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 01f78beca..3982aff38 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1285,6 +1285,7 @@ void Llama<T>::setOutputTensors(std::unordered_map<std::string, Tensor>* o
                         "The shape of cum_log_probs does not match with batch_size x beam_width.");
         cudaAutoCpy(cum_log_probs.getPtr<float>(), cum_log_probs_, cum_log_probs.size(), stream_);
     }
+    printf("max_input_length: %d\n", max_input_length);
     invokeCalculateNewTokenLength(sequence_lengths,
                                   input_lengths,
                                   batch_size,

From 287d5988cff8dff3ced5fbb819363dfdb6c7eb59 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:36:23 -0700
Subject: [PATCH 55/57] commit

---
 src/fastertransformer/kernels/gpt_kernels.cu | 6 ++++--
 src/fastertransformer/kernels/gpt_kernels.h  | 1 +
 src/fastertransformer/models/llama/Llama.cc  | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/fastertransformer/kernels/gpt_kernels.cu b/src/fastertransformer/kernels/gpt_kernels.cu
index b4982220e..913f9ab1a 100644
--- a/src/fastertransformer/kernels/gpt_kernels.cu
+++ b/src/fastertransformer/kernels/gpt_kernels.cu
@@ -570,17 +570,19 @@ void invokeTileGptInputs(int* tiled_input_ids,
 
 __global__ void calculateNewTokenLength(int* output_lengths,
                                         const int* input_lengths,
+                                        const int max_input_length,
                                         const int batch_size,
                                         const int beam_width)
 {
     for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
          index += blockDim.x * gridDim.x) {
-        output_lengths[index] -= input_lengths[index / beam_width];
+        output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
     }
 }
 
 void invokeCalculateNewTokenLength(int* output_lengths,
                                    const int* input_lengths,
+                                   const int max_input_length,
                                    const int batch_size,
                                    const int beam_width,
                                    cudaStream_t stream) {
@@ -588,7 +590,7 @@ void invokeCalculateNewTokenLength(int* output_lengths,
     dim3 block(256);
 
     calculateNewTokenLength<<<grid, block, 0, stream>>>(
-        output_lengths, input_lengths, batch_size, beam_width);
+        output_lengths, input_lengths, max_input_length, batch_size, beam_width);
 }
 
diff --git a/src/fastertransformer/kernels/gpt_kernels.h b/src/fastertransformer/kernels/gpt_kernels.h
index 9831fc687..74532f23b 100644
--- a/src/fastertransformer/kernels/gpt_kernels.h
+++ b/src/fastertransformer/kernels/gpt_kernels.h
@@ -123,6 +123,7 @@ void invokeTileGptInputs(int* tiled_input_ids,
 
 void invokeCalculateNewTokenLength(int* output_lengths,
                                    const int* input_lengths,
+                                   const int max_input_length,
                                    const int batch_size,
                                    const int beam_width,
                                    cudaStream_t stream);
diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc
index 3982aff38..cbe59a91f 100644
--- a/src/fastertransformer/models/llama/Llama.cc
+++ b/src/fastertransformer/models/llama/Llama.cc
@@ -1288,6 +1288,7 @@ void Llama<T>::setOutputTensors(std::unordered_map<std::string, Tensor>* o
     printf("max_input_length: %d\n", max_input_length);
     invokeCalculateNewTokenLength(sequence_lengths,
                                   input_lengths,
+                                  max_input_length,
                                   batch_size,
                                   beam_width,
                                   stream_);
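PATCH 55 is the substantive fix in this stretch: after decoding, every slot's sequence length includes the padded prompt (decodingInitialize starts each slot at max_input_length), so subtracting input_lengths alone, as PATCH 01 did, over-corrects any prompt shorter than the batch maximum. Subtracting only the padding, max_input_length - input_lengths, leaves input plus generated tokens. A worked check of the arithmetic (numbers are illustrative):

    #include <cstdio>

    int main()
    {
        const int max_input_length = 15;  // batch padded to the longest prompt
        const int input_length     = 8;   // this request's real prompt
        const int generated        = 32;  // tokens produced by decoding

        // decodingInitialize starts every slot at max_input_length,
        // and each decode step adds one:
        int sequence_length = max_input_length + generated;  // 47

        // PATCH 01 behavior: subtract the real input length -> 39, drops 7 prompt tokens
        printf("old: %d\n", sequence_length - input_length);

        // PATCH 55 behavior: subtract only the padding -> 40 = 8 prompt + 32 generated
        printf("new: %d\n", sequence_length - (max_input_length - input_length));
        return 0;
    }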
From e959d151f7ecdfae3ab557266d598785af8d8015 Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 21:59:27 -0700
Subject: [PATCH 56/57] commit

---
 src/fastertransformer/kernels/decoding_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu
index d62dd49b7..34970afaa 100644
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -39,7 +39,6 @@ __global__ void decodingInitialize(bool* finished,
          index += blockDim.x * gridDim.x) {
         finished[index] = false;
         sequence_length[index] = max_input_length;
-        // printf("index %d length: %d\n", index, max_input_length);
         if (word_ids != nullptr) {
             word_ids[index] = sentence_ids[index / beam_width];
         }
@@ -528,6 +527,7 @@ __global__ void gatherTree(gatherTreeParam param)
             param.max_sequence_lengths[batch * param.beam_width + j] =
                 tmp_len - param.max_prefix_soft_prompt_length
                 - (param.max_input_length - param.max_input_without_prompt_length);
+            printf("a: %d b: %d\n", param.max_sequence_lengths[batch * param.beam_width + j], tmp_len);
             // update the response input length
             if (update_response_input_length) {
                 param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;

From 8fac5b0c728a813e7789d910975a8e08f6593cda Mon Sep 17 00:00:00 2001
From: sfc-gh-zhwang
Date: Thu, 12 Oct 2023 22:00:55 -0700
Subject: [PATCH 57/57] commit

---
 src/fastertransformer/kernels/decoding_kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/fastertransformer/kernels/decoding_kernels.cu b/src/fastertransformer/kernels/decoding_kernels.cu
index 34970afaa..7e7b13b4c 100644
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -528,6 +528,7 @@ __global__ void gatherTree(gatherTreeParam param)
                 tmp_len - param.max_prefix_soft_prompt_length
                 - (param.max_input_length - param.max_input_without_prompt_length);
             printf("a: %d b: %d\n", param.max_sequence_lengths[batch * param.beam_width + j], tmp_len);
+            printf("%d %d\n", param.max_input_length, param.max_input_without_prompt_length);
             // update the response input length
             if (update_response_input_length) {
                 param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;
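The two printfs added in patches 56 and 57 bracket the length correction in gatherTree: the recorded length tmp_len is reduced by the soft-prompt prefix and by the prompt padding, leaving a length relative to the unpadded input. With no prefix soft prompt the expression reduces to the PATCH 55 formula; a worked instance with illustrative values:

    #include <cstdio>

    int main()
    {
        const int tmp_len                         = 47;  // length recorded during decoding
        const int max_prefix_soft_prompt_length   = 0;   // no soft prompt in this run
        const int max_input_length                = 15;  // padded prompt length
        const int max_input_without_prompt_length = 8;   // input length excluding any virtual prompt

        const int corrected = tmp_len - max_prefix_soft_prompt_length
                              - (max_input_length - max_input_without_prompt_length);
        printf("a: %d b: %d\n", corrected, tmp_len);  // a: 40 b: 47, matching the PATCH 56 print
        return 0;
    }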