Zhwang/output len #27

Open
wants to merge 57 commits into base: corvo
Changes from all commits: 57 commits, all by sfc-gh-zhwang with the placeholder message "commit".

Oct 12, 2023: b04253c, 72199b3, 3446b20, fed0fd9, fc983ae, 28f736f, 59d547c, 7f411be, 58f90cf, 3c6db02, 8c13f1d, 9dc0329, 8f0b203, 441af5e, 31383b4, b114cb8, 2bf928e, 47a55dc, d450e4f, f2ae7bd, 4a780cb, c163de8, d97d69c, 48b72a9, 5407d15

Oct 13, 2023: a5e7793, b39167a, 7e7101f, b8dac84, cb74cfc, 4162ce5, 59d662f, 27b843a, 4cd3026, e3f14f3, 882a430, 5c561c1, bee965e, 7d62c7f, 75e11f4, 5b365c9, f7631a5, 092e00e, 3052151, d9ce514, 0537189, 115a949, 762efd1, caadb53, dd23f20, 7d83aff, a96e57d, 59fb43d, 3311fba, 287d598, e959d15, 8fac5b0
20 changes: 18 additions & 2 deletions .vscode/settings.json
@@ -67,6 +67,22 @@
"unordered_set": "cpp",
"future": "cpp",
"cfenv": "cpp",
"typeindex": "cpp"
"typeindex": "cpp",
"variant": "cpp",
"__bit_reference": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__errc": "cpp",
"__hash_table": "cpp",
"__locale": "cpp",
"__mutex_base": "cpp",
"__node_handle": "cpp",
"__split_buffer": "cpp",
"__threading_support": "cpp",
"__tree": "cpp",
"__verbose_abort": "cpp",
"charconv": "cpp",
"ios": "cpp",
"locale": "cpp"
}
}
}
8 changes: 4 additions & 4 deletions examples/cpp/gpt/gpt_config.ini
@@ -3,7 +3,7 @@ max_batch_size=8 ; Use for allocate the buffer
max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter
beam_width=1 ; beam width for beam search
top_k=0 ; k value for top k sampling
top_p=0.5 ; p value for top p sampling
top_p=0 ; p value for top p sampling
temperature=1.0 ; Use for sampling
repetition_penalty=2.0 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
@@ -17,12 +17,12 @@ model_name=gpt_124M
; model_name=gpt_175B
; model_name=self_defined
; model_dir=./models/megatron-models/c-model/6.7b/
model_dir=models/openai-gpt-models/c-model/124m/1-gpu/
model_dir=/notebooks/ft_gpt2/1-gpu/
shared_contexts_ratio=1.0

[request]
request_batch_size=8 ; determine by the request
request_output_len=32 ; determine by the request
request_batch_size=2 ; determine by the request
request_output_len=50 ; determine by the request
return_log_probs=false ; return the output log probs and cumulative log probs.
context_log_probs=false ; include input contexts in the cumulative log probability computation.

41 changes: 23 additions & 18 deletions examples/cpp/gpt/gpt_example.cc
@@ -343,20 +343,20 @@ void gpt_example(const INIReader reader)
input_tensors.insert(
{"presence_penalty", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &presence_penalty}});
}
if (top_k == 0 && top_p == 0.0f) {
FT_CHECK(beam_width > 1);
input_tensors.insert({"beam_search_diversity_rate",
Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
}
else {
input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
if (top_p != 0.0f) {
input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
}
if (top_k != 0) {
input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
}
}
// if (top_k == 0 && top_p == 0.0f) {
// FT_CHECK(beam_width > 1);
// input_tensors.insert({"beam_search_diversity_rate",
// Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
// }
// else {
// input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
// if (top_p != 0.0f) {
// input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
// }
// if (top_k != 0) {
// input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
// }
// }

std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
{"output_ids",
@@ -439,9 +439,16 @@ void gpt_example(const INIReader reader)
size_t outCount = total_output_len * request_batch_size * beam_width;
int* hBuf = new int[outCount];
cudaD2Hcpy(hBuf, d_output_ids, outCount);
size_t seqLenCount = request_batch_size * beam_width;
int* hBuf2 = new int[seqLenCount];
cudaD2Hcpy(hBuf2, d_sequence_lengths, seqLenCount);

{
std::cout << "Writing " << outCount << " elements\n";
for (int i=0; i<seqLenCount; i++) {
printf("%d ", hBuf2[i]);
}
printf("\n");
int zeroCount = 0;
for (size_t i = 0; i < outCount; i++) {
if (hBuf[i] == int(0)) {
@@ -452,10 +459,8 @@
outFile << std::endl;
}

if (i < 10) {
printf("%5d ", hBuf[i]);
}
if ((i + 1) % (total_output_len) == 0 && i < 10) {
printf("%5d ", hBuf[i]);
if ((i + 1) % (total_output_len) == 0) {
std::cout << std::endl;
}
}
8 changes: 1 addition & 7 deletions examples/cpp/gpt/start_ids.csv
@@ -1,8 +1,2 @@
818, 262, 938, 3155, 286, 1528, 11, 257
198, 464, 968, 8221, 2732, 286, 15198, 318
464, 968, 1971, 12056, 423, 257, 649, 1182
464, 968, 1971, 3782, 468, 3199, 663, 5079
818, 257, 1445, 326, 481, 1884, 787, 340
464, 968, 1971, 12056, 6, 5859, 41683, 423
198, 198, 464, 5398, 4332, 628, 628, 198
464, 717, 640, 314, 2497, 262, 3807, 11
198
4 changes: 2 additions & 2 deletions examples/cpp/llama/llama_config.ini
@@ -10,14 +10,14 @@ model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/

[request]
beam_width=1 # beam width for beam search
top_k=1 ; k value for top k sampling
top_k=0 ; k value for top k sampling
top_p=0.0 ; p value for top p sampling
temperature=1.0 ; Use for sampling
repetition_penalty=1.0 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
len_penalty=0.0
beam_search_diversity_rate=0.0
request_batch_size=8 # determine by the request
request_batch_size=2 # determine by the request
request_output_len=32 # determine by the request

[llama_7b]
42 changes: 21 additions & 21 deletions examples/cpp/llama/llama_example.cc
@@ -44,7 +44,7 @@ int main(int argc, char* argv[])
ini_name = std::string(argv[1]);
}
else {
ini_name = "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
ini_name = "/notebooks/tmp/FasterTransformer/examples/cpp/llama/llama_config.ini";
}

INIReader reader = INIReader(ini_name);
@@ -196,7 +196,7 @@ void llama_example(const INIReader reader)
max_input_len,
end_id,
1,
"/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
"/notebooks/tmp/FasterTransformer/examples/cpp/llama/start_ids.csv");


int* d_input_ids;
@@ -390,20 +390,20 @@ void llama_example(const INIReader reader)
Tensor{MEMORY_CPU, TYPE_INT32, std::vector<size_t>{request_batch_size}, prefix_prompt_task_ids.data()}});
}

if (top_k == 0 && top_p == 0.0f) {
FT_CHECK(beam_width > 1);
input_tensors.insert({"beam_search_diversity_rate",
Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
}
else {
input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
if (top_p != 0.0f) {
input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
}
if (top_k != 0) {
input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
}
}
// if (top_k == 0 && top_p == 0.0f) {
// FT_CHECK(beam_width > 1);
// input_tensors.insert({"beam_search_diversity_rate",
// Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &beam_search_diversity_rate}});
// }
// else {
// input_tensors.insert({"random_seed", Tensor{MEMORY_CPU, TYPE_UINT64, std::vector<size_t>{1}, &random_seed}});
// if (top_p != 0.0f) {
// input_tensors.insert({"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, std::vector<size_t>{1}, &top_p}});
// }
// if (top_k != 0) {
// input_tensors.insert({"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, std::vector<size_t>{1}, &top_k}});
// }
// }

std::unordered_map<std::string, Tensor> output_tensors = std::unordered_map<std::string, Tensor>{
{"output_ids",
@@ -456,13 +456,13 @@ void llama_example(const INIReader reader)
size_t seqLCount = request_batch_size * beam_width;
int* seqlBuf = new int[seqLCount];

size_t inLCount = request_batch_size * beam_width;
int* inlBuf = new int[inLCount];

cudaD2Hcpy(hBuf, d_output_ids, outCount);
cudaD2Hcpy(seqlBuf, d_sequence_lengths, seqLCount);
cudaD2Hcpy(inlBuf, d_sequence_lengths, seqLCount);
printf("seqlBuf: %d\n", seqlBuf[0]);
printf("seq len: ");
for (int i=0; i<seqLCount; i++) {
printf("%d ", seqlBuf[i]);
}
printf("\n");

{
std::cout << "Writing " << outCount << " elements\n";
1 change: 1 addition & 0 deletions examples/cpp/llama/start_ids.csv
@@ -1 +1,2 @@
1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
1, 18637
2 changes: 1 addition & 1 deletion examples/cpp/multi_gpu_gpt/gpt_config.ini
@@ -32,7 +32,7 @@ shared_contexts_ratio=1.0

[request]
request_batch_size=8 ; determine by the request
request_output_len=32 ; determine by the request
request_output_len=50 ; determine by the request
return_log_probs=false ; return the output log probs and cumulative log probs.
context_log_probs=false ; include input contexts in the cumulative log probability computation.
remove_padding=true
3 changes: 1 addition & 2 deletions examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -45,7 +45,6 @@ int read_start_ids(size_t batch_size,
int i1 = 0;
std::vector<int> tmp_vec;
while (std::getline(lineStream, vals, ',')) {
printf("vals: %s\n", vals.c_str());
tmp_vec.push_back(std::stoi(vals));
i1++;
}
@@ -89,7 +88,7 @@ int read_start_ids(size_t batch_size,
for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) {
v_start_ids->push_back(tmp_start_ids[i][j]);
}
printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
// printf("tmp_start_lengths[i]: %d\n", tmp_start_lengths[i]);
v_start_lengths->push_back(tmp_start_lengths[i]);
}
}
2 changes: 2 additions & 0 deletions src/fastertransformer/kernels/decoding_kernels.cu
@@ -527,6 +527,8 @@ __global__ void gatherTree(gatherTreeParam param)
param.max_sequence_lengths[batch * param.beam_width + j] =
tmp_len - param.max_prefix_soft_prompt_length
- (param.max_input_length - param.max_input_without_prompt_length);
printf("a: %d b: %d\n", param.max_sequence_lengths[batch * param.beam_width + j], tmp_len);
printf("%d %d\n", param.max_input_length, param.max_input_without_prompt_length);
// update the response input length
if (update_response_input_length) {
param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;
26 changes: 26 additions & 0 deletions src/fastertransformer/kernels/gpt_kernels.cu
@@ -568,6 +568,32 @@ void invokeTileGptInputs(int* tiled_input_ids,
stream);
}

__global__ void calculateNewTokenLength(int* output_lengths,
const int* input_lengths,
const int max_input_length,
const int batch_size,
const int beam_width)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
index += blockDim.x * gridDim.x) {
output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
}
}

void invokeCalculateNewTokenLength(int* output_lengths,
const int* input_lengths,
const int max_input_length,
const int batch_size,
const int beam_width,
cudaStream_t stream) {
dim3 grid((int)ceil(batch_size * beam_width * 1.0 / 256));
dim3 block(256);

calculateNewTokenLength<<<grid, block, 0, stream>>>(
output_lengths, input_lengths, max_input_length, batch_size, beam_width);
}


void setSeqLimitLen(uint32_t* seq_len_d, Tensor seq_len, int limit_len_offset, int batch_size)
{
std::vector<uint32_t> seq_len_h(batch_size);
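For orientation, here is a minimal host-side sketch of the arithmetic the new calculateNewTokenLength kernel performs: each beam's reported output length is reduced by the prompt padding, that is, max_input_length minus the owning request's real input length. The helper name, the sample values, and the main() driver below are illustrative assumptions, not code from this PR.

// Host-side reference of the per-element update done by calculateNewTokenLength.
#include <cstdio>
#include <vector>

void calculate_new_token_length_ref(std::vector<int>&       output_lengths,  // [batch_size * beam_width]
                                    const std::vector<int>& input_lengths,   // [batch_size]
                                    int max_input_length, int batch_size, int beam_width)
{
    for (int index = 0; index < batch_size * beam_width; ++index) {
        // Remove the padding that was added to bring this request's prompt up to max_input_length.
        output_lengths[index] -= max_input_length - input_lengths[index / beam_width];
    }
}

int main()
{
    std::vector<int> output_lengths = {40, 45};  // lengths measured with padded prompts
    std::vector<int> input_lengths  = {2, 8};    // real prompt lengths per request
    calculate_new_token_length_ref(output_lengths, input_lengths, /*max_input_length=*/8,
                                   /*batch_size=*/2, /*beam_width=*/1);
    for (int len : output_lengths) {
        printf("%d ", len);  // prints: 34 45
    }
    printf("\n");
    return 0;
}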
7 changes: 7 additions & 0 deletions src/fastertransformer/kernels/gpt_kernels.h
@@ -121,6 +121,13 @@ void invokeTileGptInputs(int* tiled_input_ids,
const int max_input_length,
cudaStream_t stream);

void invokeCalculateNewTokenLength(int* output_lengths,
const int* input_lengths,
const int max_input_length,
const int batch_size,
const int beam_width,
cudaStream_t stream);

void invokeFindContextDups(int* shared_contexts,
int* batch_to_compact,
int* compact_to_batch,
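A hedged usage sketch of the new invokeCalculateNewTokenLength helper declared above, mirroring the host reference earlier. The buffer names, example values, include path, and call placement are assumptions for illustration, not code taken from this PR.

// Hypothetical call site (CUDA host code): after generation finishes, shrink the
// reported lengths so they no longer count the padding added to reach max_input_length.
// Expected output for these sample values: 34 45.
#include <cstdio>
#include <cuda_runtime.h>
#include "src/fastertransformer/kernels/gpt_kernels.h"

int main()
{
    const int batch_size = 2, beam_width = 1, max_input_length = 8;
    int h_output_lengths[] = {40, 45};  // lengths measured with padded prompts
    int h_input_lengths[]  = {2, 8};    // real prompt lengths per request

    int* d_output_lengths = nullptr;
    int* d_input_lengths  = nullptr;
    cudaMalloc(&d_output_lengths, sizeof(h_output_lengths));
    cudaMalloc(&d_input_lengths, sizeof(h_input_lengths));
    cudaMemcpy(d_output_lengths, h_output_lengths, sizeof(h_output_lengths), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input_lengths, h_input_lengths, sizeof(h_input_lengths), cudaMemcpyHostToDevice);

    // Launch on the default stream; a real caller would pass its decoding stream.
    fastertransformer::invokeCalculateNewTokenLength(
        d_output_lengths, d_input_lengths, max_input_length, batch_size, beam_width, /*stream=*/nullptr);

    cudaMemcpy(h_output_lengths, d_output_lengths, sizeof(h_output_lengths), cudaMemcpyDeviceToHost);
    printf("%d %d\n", h_output_lengths[0], h_output_lengths[1]);

    cudaFree(d_output_lengths);
    cudaFree(d_input_lengths);
    return 0;
}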
5 changes: 4 additions & 1 deletion src/fastertransformer/kernels/sampling_topk_kernels.cu
@@ -225,6 +225,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
const int tid = threadIdx.x;
const int batch_id = blockIdx.x;
if (skip_decode != nullptr && skip_decode[batch_id]) {
printf("skip decode\n");
return;
}

@@ -245,8 +246,9 @@
s_sum = 0.0f;
}
TopK_2<float> partial;

// printf("end id: %d\n", end_ids[batch_id]);
if (finished != nullptr && finished[batch_id] == true) {
printf("batch id: %d", batch_id);
ids[batch_id] = end_ids[batch_id];
return;
}
@@ -307,6 +309,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
if (sequence_length != nullptr && finished != nullptr) {
sequence_length[batch_id] = finished[batch_id] ? sequence_length[batch_id] : sequence_length[batch_id] + 1;
finished[batch_id] = ids[batch_id] == end_ids[batch_id] ? true : false;
printf("batch %d: %d %d %d\n", batch_id, finished[batch_id], ids[batch_id], end_ids[batch_id]);
}
}
}
@@ -244,7 +244,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
stream_);
sync_check_cuda_error();
}

// printf("TopKSamplingLayer<T>::runSampling\n");
invokeBatchTopKSampling(
sampling_workspace_,
sampling_workspace_size_,
@@ -278,6 +278,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);

printf("TopPSamplingLayer<T>::runSampling\n");
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int step = input_tensors->at("step").getVal<int>();
Expand Down
Loading