Zhwang/output len #27

Open
wants to merge 57 commits into base: corvo
Changes from all commits: 57 commits, all by sfc-gh-zhwang with the placeholder message "commit".

Oct 12, 2023: b04253c, 72199b3, 3446b20, fed0fd9, fc983ae, 28f736f, 59d547c, 7f411be, 58f90cf, 3c6db02, 8c13f1d, 9dc0329, 8f0b203, 441af5e, 31383b4, b114cb8, 2bf928e, 47a55dc, d450e4f, f2ae7bd, 4a780cb, c163de8, d97d69c, 48b72a9, 5407d15

Oct 13, 2023: a5e7793, b39167a, 7e7101f, b8dac84, cb74cfc, 4162ce5, 59d662f, 27b843a, 4cd3026, e3f14f3, 882a430, 5c561c1, bee965e, 7d62c7f, 75e11f4, 5b365c9, f7631a5, 092e00e, 3052151, d9ce514, 0537189, 115a949, 762efd1, caadb53, dd23f20, 7d83aff, a96e57d, 59fb43d, 3311fba, 287d598, e959d15, 8fac5b0
20 changes: 18 additions & 2 deletions .vscode/settings.json
@@ -67,6 +67,22 @@
"unordered_set": "cpp",
"future": "cpp",
"cfenv": "cpp",
"typeindex": "cpp"
"typeindex": "cpp",
"variant": "cpp",
"__bit_reference": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__split_buffer": "cpp",
"__threading_support": "cpp",
"__tree": "cpp",
"__verbose_abort": "cpp",
"charconv": "cpp",
"ios": "cpp",
"locale": "cpp"
}
}
}
8 changes: 4 additions & 4 deletions examples/cpp/gpt/gpt_config.ini
@@ -3,7 +3,7 @@ max_batch_size=8 ; Use for allocate the buffer
max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter
beam_width=1 ; beam width for beam search
top_k=0 ; k value for top k sampling
top_p=0.5 ; p value for top p sampling
top_p=0 ; p value for top p sampling
temperature=1.0 ; Use for sampling
repetition_penalty=2.0 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
@@ -17,12 +17,12 @@ model_name=gpt_124M
; model_name=gpt_175B
; model_name=self_defined
; model_dir=./models/megatron-models/c-model/6.7b/
model_dir=models/openai-gpt-models/c-model/124m/1-gpu/
model_dir=/notebooks/ft_gpt2/1-gpu/
shared_contexts_ratio=1.0

[request]
request_batch_size=8 ; determine by the request
request_output_len=32 ; determine by the request
request_batch_size=2 ; determine by the request
request_output_len=50 ; determine by the request
return_log_probs=false ; return the output log probs and cumulative log probs.
context_log_probs=false ; include input contexts in the cumulative log probability computation.

41 changes: 23 additions & 18 deletions examples/cpp/gpt/gpt_example.cc
@@ -343,20 +343,20 @@ void gpt_example(const INIReader reader)
input_tensors.insert(
{"presence_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &presence_penalty}});
}
if (top_k == 0 && top_p == 0.0f) {
FT_CHECK(beam_width > 1);
input_tensors.insert({"beam_search_diversity_rate",
Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
}
else {
input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
if (top_p != 0.0f) {
input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
}
if (top_k != 0) {
input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
}
}
// if (top_k == 0 && top_p == 0.0f) {
// FT_CHECK(beam_width > 1);
// input_tensors.insert({"beam_search_diversity_rate",
// Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
// }
// else {
// input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
// if (top_p != 0.0f) {
// input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
// }
// if (top_k != 0) {
// input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
// }
// }

std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
{"output_ids",
@@ -439,9 +439,16 @@ void gpt_example(const INIReader reader)
size_t outCount = total_output_len * request_batch_size * beam_width;
int* hBuf = new int[outCount];
cudaD2Hcpy(hBuf, d_output_ids, outCount);
size_t seqLenCount = request_batch_size * beam_width;
int* hBuf2 = new int[seqLenCount];
cudaD2Hcpy(hBuf2, d_sequence_lengths, seqLenCount);

{
std::cout << "Writing " << outCount << " elements\n";
for (int i=0; i<seqLenCount; i++) {
printf("%d ", hBuf2[i]);
}
printf("\n");
int zeroCount = 0;
for (size_t i = 0; i < outCount; i++) {
if (hBuf[i] == int(0)) {
@@ -452,10 +459,8 @@
outFile << std::endl;
}

if (i < 10) {
printf("%5d ", hBuf[i]);
}
if ((i + 1) % (total_output_len) == 0 && i < 10) {
printf("%5d ", hBuf[i]);
if ((i + 1) % (total_output_len) == 0) {
std::cout << std::endl;
}
}
8 changes: 1 addition & 7 deletions examples/cpp/gpt/start_ids.csv
@@ -1,8 +1,2 @@
818, 262, 938, 3155, 286, 1528, 11, 257
198, 464, 968, 8221, 2732, 286, 15198, 318
464, 968, 1971, 12056, 423, 257, 649, 1182
464, 968, 1971, 3782, 468, 3199, 663, 5079
818, 257, 1445, 326, 481, 1884, 787, 340
464, 968, 1971, 12056, 6, 5859, 41683, 423
198, 198, 464, 5398, 4332, 628, 628, 198
464, 717, 640, 314, 2497, 262, 3807, 11
198
4 changes: 2 additions & 2 deletions examples/cpp/llama/llama_config.ini
@@ -10,14 +10,14 @@ model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/

[request]
beam_width=1 # beam width for beam search
top_k=1 ; k value for top k sampling
top_k=0 ; k value for top k sampling
top_p=0.0 ; p value for top p sampling
temperature=1.0 ; Use for sampling
repetition_penalty=1.0 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
len_penalty=0.0
beam_search_diversity_rate=0.0
request_batch_size=8 # determine by the request
request_batch_size=2 # determine by the request
request_output_len=32 # determine by the request

[llama_7b]
42 changes: 21 additions & 21 deletions examples/cpp/llama/llama_example.cc
@@ -44,7 +44,7 @@ int main(int argc, char* argv[])
ini_name = std::string(argv[1]);
}
else {
ini_name = "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
ini_name = "/notebooks/tmp/FasterTransformer/examples/cpp/llama/llama_config.ini";
}

INIReader reader = INIReader(ini_name);
@@ -196,7 +196,7 @@ void llama_example(const INIReader reader)
max_input_len,
end_id,
1,
"/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
"/notebooks/tmp/FasterTransformer/examples/cpp/llama/start_ids.csv");


int* d_input_ids;
@@ -390,20 +390,20 @@ void llama_example(const INIReader reader)
Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, prefix_prompt_task_ids.data()}});
}

if (top_k == 0 && top_p == 0.0f) {
FT_CHECK(beam_width > 1);
input_tensors.insert({"beam_search_diversity_rate",
Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
}
else {
input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
if (top_p != 0.0f) {
input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
}
if (top_k != 0) {
input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
}
}
// if (top_k == 0 && top_p == 0.0f) {
// FT_CHECK(beam_width > 1);
// input_tensors.insert({"beam_search_diversity_rate",
// Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
// }
// else {
// input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
// if (top_p != 0.0f) {
// input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
// }
// if (top_k != 0) {
// input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
// }
// }

std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
{"output_ids",
@@ -456,13 +456,13 @@ void llama_example(const INIReader reader)
size_t seqLCount = request_batch_size * beam_width;
int* seqlBuf = new int[seqLCount];

size_t inLCount = request_batch_size * beam_width;
int* inlBuf = new int[inLCount];

cudaD2Hcpy(hBuf, d_output_ids, outCount);
cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
printf("seqlBuf: %d\n", seqlBuf[0]);
printf("seq len: ");
for (int i=0; i<seqLCount; i++) {
printf("%d ", seqlBuf[i]);
}
printf("\n");

{
std::cout << "Writing " << outCount << " elements\n";
1 change: 1 addition & 0 deletions examples/cpp/llama/start_ids.csv
@@ -1 +1,2 @@
1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
1, 18637
2 changes: 1 addition & 1 deletion examples/cpp/multi_gpu_gpt/gpt_config.ini
@@ -32,7 +32,7 @@ shared_contexts_ratio=1.0

[request]
request_batch_size=8 ; determine by the request
request_output_len=32 ; determine by the request
request_output_len=50 ; determine by the request
return_log_probs=false ; return the output log probs and cumulative log probs.
context_log_probs=false ; include input contexts in the cumulative log probability computation.
remove_padding=true
3 changes: 1 addition & 2 deletions examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -45,7 +45,6 @@ int read_start_ids(size_t batch_size,
int i1 = 0;
std::vector<int> tmp_vec;
while (std::getline(lineStream, vals, ',')) {
printf("vals: %s\n", vals.c_str());
tmp_vec.push_back(std::stoi(vals));
i1++;
}
@@ -89,7 +88,7 @@ int read_start_ids(size_t batch_size,
for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) {
v_start_ids->push_back(tmp_start_ids[i][j]);
}
printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
// printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
v_start_lengths->push_back(tmp_start_lengths[i]);
}
}
2 changes: 2 additions & 0 deletions src/fastertransformer/kernels/decoding_kernels.cu
@@ -527,6 +527,8 @@ __global__ void gatherTree(gatherTreeParam param)
param.max_sequence_lengths[batch * param.beam_width + j] =
tmp_len - param.max_prefix_soft_prompt_length
- (param.max_input_length - param.max_input_without_prompt_length);
printf("a: %d b: %d\n", param.max_sequence_lengths[batch * param.beam_width + j], tmp_len);
printf("%d %d\n", param.max_input_length, param.max_input_without_prompt_length);
// update the response input length
if (update_response_input_length) {
param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;
26 changes: 26 additions & 0 deletions src/fastertransformer/kernels/gpt_kernels.cu
@@ -568,6 +568,32 @@ void invokeTileGptInputs(int* tiled_input_ids,
stream);
}

__global__ void calculateNewTokenLength(int* output_lengths,
const int* input_lengths,
const int max_input_length,
const int batch_size,
const int beam_width)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
index += blockDim.x * gridDim.x) {
output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
}
}

void invokeCalculateNewTokenLength(int* output_lengths,
const int* input_lengths,
const int max_input_length,
const int batch_size,
const int beam_width,
cudaStream_t stream) {
dim3 grid((int)ceil(batch_size * beam_width * 1.0 / 256));
dim3 block(256);

calculateNewTokenLength<<<grid, block, 0, stream>>>(
output_lengths, input_lengths, max_input_length, batch_size, beam_width);
}


void setSeqLimitLen(uint32_t* seq_len_d, Tensor seq_len, int limit_len_offset, int batch_size)
{
std::vector<uint32_t> seq_len_h(batch_size);
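For orientation, here is a minimal host-side sketch of the arithmetic the new calculateNewTokenLength kernel performs: each beam's reported output length is reduced by the prompt padding, that is, max_input_length minus the owning request's real input length. The helper name, the sample values, and the main() driver below are illustrative assumptions, not code from this PR.

// Host-side reference of the per-element update done by calculateNewTokenLength.
#include <cstdio>
#include <vector>

void calculate_new_token_length_ref(std::vector<int>&       output_lengths,  // [batch_size * beam_width]
                                    const std::vector<int>& input_lengths,   // [batch_size]
                                    int max_input_length, int batch_size, int beam_width)
{
    for (int index = 0; index < batch_size * beam_width; ++index) {
        // Remove the padding that was added to bring this request's prompt up to max_input_length.
        output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
    }
}

int main()
{
    std::vector<int> output_lengths = {40, 45};  // lengths measured with padded prompts
    std::vector<int> input_lengths  = {2, 8};    // real prompt lengths per request
    calculate_new_token_length_ref(output_lengths, input_lengths, /*max_input_length=*/8,
                                   /*batch_size=*/2, /*beam_width=*/1);
    for (int len : output_lengths) {
        printf("%d ", len);  // prints: 34 45
    }
    printf("\n");
    return 0;
}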
7 changes: 7 additions & 0 deletions src/fastertransformer/kernels/gpt_kernels.h
@@ -121,6 +121,13 @@ void invokeTileGptInputs(int* tiled_input_ids,
const int max_input_length,
cudaStream_t stream);

void invokeCalculateNewTokenLength(int* output_lengths,
const int* input_lengths,
const int max_input_length,
const int batch_size,
const int beam_width,
cudaStream_t stream);

void invokeFindContextDups(int* shared_contexts,
int* batch_to_compact,
int* compact_to_batch,
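A hedged usage sketch of the new invokeCalculateNewTokenLength helper declared above, mirroring the host reference earlier. The buffer names, example values, include path, and call placement are assumptions for illustration, not code taken from this PR.

// Hypothetical call site (CUDA host code): after generation finishes, shrink the
// reported lengths so they no longer count the padding added to reach max_input_length.
// Expected output for these sample values: 34 45.
#include <cstdio>
#include <cuda_runtime.h>
#include "src/fastertransformer/kernels/gpt_kernels.h"

int main()
{
    const int batch_size = 2, beam_width = 1, max_input_length = 8;
    int h_output_lengths[] = {40, 45};  // lengths measured with padded prompts
    int h_input_lengths[]  = {2, 8};    // real prompt lengths per request

    int* d_output_lengths = nullptr;
    int* d_input_lengths  = nullptr;
    cudaMalloc(&d_output_lengths, sizeof(h_output_lengths));
    cudaMalloc(&d_input_lengths, sizeof(h_input_lengths));
    cudaMemcpy(d_output_lengths, h_output_lengths, sizeof(h_output_lengths), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input_lengths, h_input_lengths, sizeof(h_input_lengths), cudaMemcpyHostToDevice);

    // Launch on the default stream; a real caller would pass its decoding stream.
    fastertransformer::invokeCalculateNewTokenLength(
        d_output_lengths, d_input_lengths, max_input_length, batch_size, beam_width, /*stream=*/nullptr);

    cudaMemcpy(h_output_lengths, d_output_lengths, sizeof(h_output_lengths), cudaMemcpyDeviceToHost);
    printf("%d %d\n", h_output_lengths[0], h_output_lengths[1]);

    cudaFree(d_output_lengths);
    cudaFree(d_input_lengths);
    return 0;
}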
5 changes: 4 additions & 1 deletion src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -225,6 +225,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
const int tid = threadIdx.x;
const int batch_id = blockIdx.x;
if (skip_decode != nullptr && skip_decode[batch_id]) {
printf("skip decode\n");
return;
}

@@ -245,8 +246,9 @@
s_sum = 0.0f;
}
TopK_2<float> partial;

// printf("end id: %d\n", end_ids[batch_id]);
if (finished != nullptr && finished[batch_id] == true) {
printf("batch id: %d", batch_id);
ids[batch_id] = end_ids[batch_id];
return;
}
@@ -307,6 +309,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
if (sequence_length != nullptr && finished != nullptr) {
sequence_length[batch_id] = finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
printf("batch %d: %d %d %d\n", batch_id, finished[batch_id], ids[batch_id], end_ids[batch_id]);
}
}
}
@@ -244,7 +244,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
stream_);
sync_check_cuda_error();
}

// printf("TopKSamplingLayer<T>::runSampling\n");
invokeBatchTopKSampling(
sampling_workspace_,
sampling_workspace_size_,
@@ -278,6 +278,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);

printf("TopPSamplingLayer<T>::runSampling\n");
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int step = input_tensors->at("step").getVal<int>();
Expand Down
Loading