Zhwang/code llama debug #23
Open · sfc-gh-zhwang wants to merge 178 commits into base: corvo
Changes from all commits (178 commits, Oct 6–8, 2023)
9 changes: 5 additions & 4 deletions examples/cpp/llama/llama_config.ini
@@ -6,7 +6,7 @@ tensor_para_size=1
pipeline_para_size=1

model_name=llama_7b
model_dir=/notebooks/llama-2-70b-hf-ft-tp-1_llama_decoder/1/1-gpu/
model_dir=/notebooks/code-llama-34b_llama_decoder/1/1-gpu/

[request]
beam_width=1 # beam width for beam search
@@ -17,18 +17,19 @@ repetition_penalty=1.0 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed.
len_penalty=0.0
beam_search_diversity_rate=0.0
request_batch_size=8 # determine by the request
request_batch_size=1 # determine by the request
request_output_len=32 # determine by the request

[llama_7b]
head_num = 64
kv_head_num = 8
size_per_head = 128
inter_size = 28672
num_layer = 3
inter_size = 22016
num_layer = 1
rotary_embedding = 128
layernorm_eps = 1e-05
vocab_size = 32000
rope_theta = 1000000
start_id = 1
end_id = 2
weight_data_type = fp16
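For orientation, the [llama_7b] section above now describes Code Llama 34B-style grouped-query attention: 64 query heads share 8 KV heads of 128 dims each, inter_size drops to 22016, rope_theta is raised to 1,000,000, and num_layer = 1 suggests a single-layer debugging setup. A minimal sketch of the sizes these values imply, assuming the usual head_num × size_per_head layout (variable names below are illustrative, not from the repo):

#include <cstddef>
#include <cstdio>

int main()
{
    // Values from the [llama_7b] section above.
    const size_t head_num      = 64;
    const size_t kv_head_num   = 8;
    const size_t size_per_head = 128;

    const size_t hidden_units    = head_num * size_per_head;          // 8192
    const size_t kv_hidden_units = kv_head_num * size_per_head;       // 1024

    // Fused QKV width per token: full-width Q plus narrower K and V (GQA),
    // versus the 3 * hidden_units that plain multi-head attention would need.
    const size_t qkv_width_gqa = hidden_units + 2 * kv_hidden_units;  // 10240
    const size_t qkv_width_mha = 3 * hidden_units;                    // 24576

    std::printf("hidden=%zu kv_hidden=%zu qkv(gqa)=%zu qkv(mha)=%zu\n",
                hidden_units, kv_hidden_units, qkv_width_gqa, qkv_width_mha);
    return 0;
}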
10 changes: 7 additions & 3 deletions examples/cpp/llama/llama_example.cc
@@ -44,7 +44,7 @@ int main(int argc, char* argv[])
ini_name = std::string(argv[1]);
}
else {
ini_name = "/notebooks/FasterTransformer/examples/cpp/llama/llama_config.ini";
ini_name = "/notebooks/tmp/FasterTransformer/examples/cpp/llama/llama_config.ini";
}

INIReader reader = INIReader(ini_name);
@@ -90,6 +90,7 @@ void llama_example(const INIReader reader)
const size_t decoder_layers = reader.GetInteger(model_name, "num_layer");
const size_t rotary_embedding_dim = reader.GetInteger(model_name, "rotary_embedding");
const float rope_theta = reader.GetFloat(model_name, "rope_theta", 10000.f);
printf("rope_theta: %f\n", rope_theta);
const float layernorm_eps = reader.GetFloat(model_name, "layernorm_eps");
const int start_id = reader.GetInteger(model_name, "start_id");
const int end_id = reader.GetInteger(model_name, "end_id");
@@ -196,7 +197,8 @@ void llama_example(const INIReader reader)
max_input_len,
end_id,
1,
"/notebooks/FasterTransformer/examples/cpp/llama/start_ids.csv");
"/notebooks/tmp/FasterTransformer/examples/cpp/llama/start_ids_real.csv");
// "/notebooks/tmp/FasterTransformer/examples/cpp/llama/start_ids_real.csv");


int* d_input_ids;
@@ -475,7 +477,9 @@ void llama_example(const INIReader reader)
if ((i + 1) % (total_output_len) == 0) {
outFile << std::endl;
}
printf("%5d ", hBuf[i]);
if (i+32 >= outCount) {
printf("%d, ", hBuf[i]);
}
// if (i < 10) {
// printf("%5d ", hBuf[i]);
// }
2 changes: 1 addition & 1 deletion examples/cpp/llama/start_ids.csv
@@ -1 +1 @@
1, 18637, 29892, 526, 366, 1136, 455, 2470, 29973, 1815, 366, 5193, 304, 592, 29973
1, 518, 25580, 29962, 2266, 338, 263, 2566, 10938, 3583, 29876, 29989
1 change: 1 addition & 0 deletions examples/cpp/llama/start_ids_real.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/cpp/multi_gpu_gpt/gpt_example_utils.cc
@@ -45,7 +45,7 @@ int read_start_ids(size_t batch_size,
int i1 = 0;
std::vector<int> tmp_vec;
while (std::getline(lineStream, vals, ',')) {
printf("vals: %s\n", vals.c_str());
// printf("vals: %s\n", vals.c_str());
tmp_vec.push_back(std::stoi(vals));
i1++;
}
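For context, read_start_ids parses each line of the start-ids CSV as comma-separated token ids for one request; the silenced printf above was logging every parsed field. A minimal standalone sketch of that inner loop, using the ids from the start_ids.csv hunk above as sample input:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    // One line of start_ids.csv: comma-separated token ids for a single request.
    const std::string line = "1, 518, 25580, 29962";
    std::stringstream lineStream(line);
    std::string vals;
    std::vector<int> tmp_vec;
    while (std::getline(lineStream, vals, ',')) {
        tmp_vec.push_back(std::stoi(vals));   // std::stoi skips the leading space
    }
    std::printf("parsed %zu token ids, first = %d\n", tmp_vec.size(), tmp_vec.front());
    return 0;
}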
2 changes: 1 addition & 1 deletion examples/pytorch/bart/utils/ft_encoder.py
@@ -16,7 +16,7 @@
import torch.nn as nn
import torch.distributed as dist
import numpy as np
from transformers import MBartForConditionalGeneration, BartModel
from transformers import MBartForConditionalGeneration, BartModel, LlamaForCausalLM

class FTBartEncoderWeight(object):
def __init__(
Changes to an additional file (path not rendered in this view)
@@ -1326,7 +1326,7 @@ inline __device__ void zero(T& dst)

inline __device__ float2 rotary_embedding_coefficient(const int zid, const int rot_embed_dim, const float rope_theta, const float t_step)
{
const float inv_freq = t_step / pow(rope_theta, zid / (float)rot_embed_dim);
const float inv_freq = t_step / pow(1000000.f, zid / (float)rot_embed_dim);
return {cos(inv_freq), sin(inv_freq)};
}

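The one-line change above hardcodes the RoPE base to 1000000.f (Code Llama's rope_theta, matching the config hunk earlier) in place of the rope_theta argument that the rest of this PR plumbs through, presumably as a debugging shortcut. A small standalone sketch of what the coefficient computes, keeping the base as a parameter (illustrative only, not the repo's exact code):

#include <cmath>
#include <cstdio>

// Rotation angle and (cos, sin) pair for one rotary channel pair.
// zid: even channel index within the head, rot_embed_dim: rotary dims per head,
// theta: RoPE base (10000 for Llama 2, 1000000 for Code Llama), t_step: token position.
static void rotary_coefficient(int zid, int rot_embed_dim, float theta, float t_step,
                               float* c, float* s)
{
    const float angle = t_step / std::pow(theta, zid / (float)rot_embed_dim);
    *c = std::cos(angle);
    *s = std::sin(angle);
}

int main()
{
    float c, s;
    // Same position and channel, two bases: a larger theta rotates more slowly,
    // which is what lets the longer Code Llama context work.
    rotary_coefficient(/*zid=*/64, /*rot_embed_dim=*/128, 10000.f,    /*t_step=*/4096.f, &c, &s);
    std::printf("theta=1e4: cos=%f sin=%f\n", c, s);
    rotary_coefficient(64, 128, 1000000.f, 4096.f, &c, &s);
    std::printf("theta=1e6: cos=%f sin=%f\n", c, s);
    return 0;
}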
Changes to an additional file (path not rendered in this view)
@@ -25,10 +25,13 @@
////////////////////////////////////////////////////////////////////////////////////////////////////

#define MGQA_LAUNCH_KERNEL( \
T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, HAS_BEAMS, stream) \
size_t smem_sz = mmha::smem_size_in_bytes<T>(params, THDS_PER_VALUE, THDS_PER_BLOCK); \
T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, HAS_BEAMS, stream) \
size_t smem_sz = mmha::smem_size_in_bytes<T>(params, THDS_PER_VALUE, THDS_PER_BLOCK); \
dim3 grid(params.num_heads, params.batch_size); \
mmha::masked_groupedquery_attention_kernel<T, \
cudaFuncSetAttribute(mmha::masked_groupedquery_attention_kernel<T, \
Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, HAS_BEAMS>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz); \
mmha::masked_groupedquery_attention_kernel<T, \
Dh, \
Dh_MAX, \
THDS_PER_KEY, \
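The macro edit above inserts a cudaFuncSetAttribute call so the grouped-query attention kernel is allowed to request more dynamic shared memory than the 48 KB default cap before it is launched with smem_sz bytes. A minimal standalone sketch of the same pattern with a dummy kernel (all names here are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

// Dummy kernel that consumes dynamically sized shared memory.
__global__ void big_smem_kernel(float* out)
{
    extern __shared__ float smem[];
    smem[threadIdx.x] = threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) out[blockIdx.x] = smem[0];
}

int main()
{
    const size_t smem_sz = 96 * 1024;  // 96 KB: above the 48 KB limit, so opt-in is required
    float* d_out = nullptr;
    cudaMalloc(&d_out, sizeof(float));

    // Raise the kernel's dynamic shared memory cap before launching with smem_sz.
    cudaFuncSetAttribute(big_smem_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int)smem_sz);

    big_smem_kernel<<<1, 256, smem_sz, 0>>>(d_out);
    cudaError_t err = cudaDeviceSynchronize();
    std::printf("launch: %s\n", cudaGetErrorString(err));

    cudaFree(d_out);
    return 0;
}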
Changes to an additional file (path not rendered in this view)
@@ -71,6 +71,7 @@ AttentionType getAttentionType(size_t size_per_head,
// FMHA_ENABLE only affects gpt-style models (causal-mask)
char * fused_qkv = std::getenv("FMHA_ENABLE");
if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") {
printf("flash attention");
if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)
&& (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
|| size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256)) {
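The added printf above fires when the FMHA_ENABLE environment variable switches on fused (flash) attention; the surrounding condition additionally restricts it to supported SM versions and head sizes. A trivial sketch of that gate in isolation (illustrative only):

#include <cstdio>
#include <cstdlib>
#include <string>

int main()
{
    // Same gate as the hunk above: fused QKV (flash) attention only when FMHA_ENABLE=ON
    // and the head size is one of the supported values.
    const char* fused_qkv = std::getenv("FMHA_ENABLE");
    const bool  requested = fused_qkv != nullptr && std::string(fused_qkv) == "ON";

    const int size_per_head = 128;
    const bool supported_head =
        size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
        || size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256;

    std::printf("fused attention: %s\n", (requested && supported_head) ? "on" : "off");
    return 0;
}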
Changes to an additional file (path not rendered in this view)
@@ -728,27 +728,38 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size, size_t seq
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T);
// NOTE (perkzz): use sizeof(T) here for cutlass int8 kernels.
printf("local_hidden_units_: %d\n", local_hidden_units_);
const auto type_size = sizeof(T);
printf("%ld\n", type_size * 30 * batch_size * seq_len * local_hidden_units_);
printf("%ld\n", type_size * 3 * batch_size * seq_len * local_hidden_units_);
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, type_size * 3 * batch_size * seq_len * local_hidden_units_, true);
printf("qkv_buf_\n");
if (local_kv_head_num_ != local_head_num_) {
size_t local_qkv_size = local_hidden_units_ + 2 * local_kv_head_num_ * size_per_head_;
qkv_buf_tmp_ = (T*)allocator_->reMalloc(qkv_buf_tmp_, type_size * batch_size * seq_len * local_qkv_size, true);
} else {
qkv_buf_tmp_ = qkv_buf_;
}
q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * batch_size * seq_len * 3 * local_hidden_units_, true);
printf("q_buf_2_\n");
k_buf_2_ = q_buf_2_ + batch_size * seq_len * local_hidden_units_;
v_buf_2_ = k_buf_2_ + batch_size * seq_len * local_hidden_units_;

// save memory usage when using fmha
if (allocate_qk_buf) {
printf("allocate_qk_buf\n");
auto x = sizeof(T) * batch_size * local_head_num_ * seq_len * seq_len;
printf("%ld\n", x);
qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * seq_len * seq_len, true);
}
else {
allocator_->free((void**)(&qk_buf_));
}
printf("qkv_buf_2_\n");
qkv_buf_2_ = (T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * seq_len * local_hidden_units_, true);
printf("qkv_buf_2_\n");
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, type_size * batch_size * seq_len * local_hidden_units_, true);
printf("qkv_buf_3_\n");

if (is_qk_buf_float_ == true) {
if (allocate_qk_buf) {
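Several of the probes above print buffer sizes; the one guarded by allocate_qk_buf measures qk_buf_, the full seq_len × seq_len attention-score buffer that fused attention avoids (per the "save memory usage when using fmha" comment). As a hypothetical worked example of why that allocation is the one to watch (numbers assumed, not taken from a run):

#include <cstddef>
#include <cstdio>

int main()
{
    // Assumed: fp16 activations, batch 1, 64 local heads, a 4096-token prompt.
    const size_t type_size = 2;   // sizeof(half)
    const size_t batch = 1, local_head_num = 64, seq_len = 4096;

    const size_t qk_buf_bytes = type_size * batch * local_head_num * seq_len * seq_len;
    std::printf("qk_buf_: %zu bytes (%.2f GiB)\n",
                qk_buf_bytes, qk_buf_bytes / (1024.0 * 1024.0 * 1024.0));  // 2.00 GiB
    return 0;
}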
65 changes: 64 additions & 1 deletion src/fastertransformer/models/llama/Llama.cc
@@ -762,6 +762,23 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
gpt_context_decoder_->forward(
&decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights);
sync_check_cuda_error();
{
T* buf;
int st = 1;
for (int k=0; k<self_k_cache_shape.size(); k++) {
st *= self_k_cache_shape[k];
}
buf = new T[st];
cudaMemcpy(buf, key_cache_, sizeof(T) * st, cudaMemcpyDeviceToHost);
printf("key_cache_ at gpt_context_decoder_\n");
for (int i=max_input_length-10; i<max_input_length+2; i++) {
for (int j=0; j<8; j++) {
printf("%f ", double(buf[i*8+j]));
}
printf("\n");
}
printf("\n");
}
invokeDecodingInitialize(finished_buf_,
sequence_lengths_,
nullptr,
@@ -919,10 +936,34 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
decoder_output_buf_ + hidden_units_offset}},
{"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_shape, key_cache_}},
{"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_shape, value_cache_}}};
// for (int i=0; i< self_k_cache_shape.size(); i++) {
// printf("self_k_cache_shape: %d\n", self_k_cache_shape[i]);
// }
// for (int i=0; i< self_v_cache_shape.size(); i++) {
// printf("self_v_cache_shape: %d\n", self_v_cache_shape[i]);
// }
gpt_decoder_->forward(
&decoder_output_tensors, &decoder_input_tensors, &gpt_weights->decoder_layer_weights);
}

if (step <= max_input_length + 1) {
T* buf;
int st = 1;
for (int k=0; k<self_k_cache_shape.size(); k++) {
st *= self_k_cache_shape[k];
}
buf = new T[st];
cudaMemcpy(buf, key_cache_, sizeof(T) * st, cudaMemcpyDeviceToHost);
printf("key_cache_ at step: %d\n", step);
for (int i=max_input_length-10; i<max_input_length+2; i++) {
for (int j=0; j<8; j++) {
printf("%f ", double(buf[i*8+j]));
}
printf("\n");
}
printf("\n");
}

if (pipeline_para_.rank_ == pipeline_para_.world_size_ - 1) {
invokeGeneralT5LayerNorm(normed_decoder_output_buf_ + hidden_units_offset,
decoder_output_buf_ + hidden_units_offset,
@@ -934,6 +975,18 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
stream_);
sync_check_cuda_error();

// if (step == max_input_length) {
// T* buf;
// int st = hidden_units_;
// buf = new T[st];
// cudaMemcpy(buf, normed_decoder_output_buf_, sizeof(T) * st, cudaMemcpyDeviceToHost);
// printf("normed_decoder_output_buf_ at step: %d\n", step);
// for (int i=0; i<st; i++) {
// printf("%f ", double(buf[i]));
// }
// printf("\n");
// }


if (tensor_para_.world_size_ == 1) {
float alpha = 1.0f;
@@ -1064,7 +1117,17 @@ void Llama<T>::forward(std::unordered_map<std::string, Tensor>* output_ten
}
dynamic_decode_output_tensors.insert(*t);
}

if (step == max_input_length && 0) {
float* buf;
int st = vocab_size_padded_;
buf = new float[st];
cudaMemcpy(buf, logits_buf_, sizeof(float) * st, cudaMemcpyDeviceToHost);
printf("logits_buf_ at step: %d\n", step);
for (int i=0; i<10; i++) {
printf("%f ", double(buf[i]));
}
printf("\n");
}
dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
*generation_should_stop_ &= subbatch_should_stop;
}
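The key_cache_ debug blocks above copy the entire cache to the host just to print a dozen rows around max_input_length, and the new[] buffers are not freed; that is fine for throwaway debugging, but a small helper keeps it contained. A sketch of one way to do it (hypothetical code, not part of the PR):

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Copy a device buffer to the host and print `cols` values for each row in [row_begin, row_end).
// For half-precision caches, include cuda_fp16.h so the element converts cleanly for printing.
template <typename T>
void dump_device_rows(const T* device_ptr, size_t total_elems, size_t row_begin, size_t row_end,
                      size_t cols, const char* tag)
{
    std::vector<T> host(total_elems);
    cudaMemcpy(host.data(), device_ptr, sizeof(T) * total_elems, cudaMemcpyDeviceToHost);
    std::printf("%s\n", tag);
    for (size_t i = row_begin; i < row_end; ++i) {
        for (size_t j = 0; j < cols; ++j) {
            std::printf("%f ", double(host[i * cols + j]));
        }
        std::printf("\n");
    }
}

// Usage mirroring the debug block above (st = product of self_k_cache_shape):
// dump_device_rows(key_cache_, st, max_input_length - 10, max_input_length + 2, 8, "key_cache_");

int main()
{
    // Tiny demo: 4 rows x 8 cols of floats on the device.
    const size_t rows = 4, cols = 8, n = rows * cols;
    std::vector<float> h(n);
    for (size_t i = 0; i < n; ++i) h[i] = float(i);
    float* d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemcpy(d, h.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    dump_device_rows(d, n, 1, 3, cols, "demo buffer");
    cudaFree(d);
    return 0;
}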
17 changes: 17 additions & 0 deletions src/fastertransformer/models/llama/LlamaContextDecoder.cc
@@ -593,6 +593,23 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
request_batch_size,
hidden_units_,
stream_);

// {
// T* buf;
// int st = seq_len*8192;
// buf = new T[st];
// cudaMemcpy(buf, output_tensors->at("decoder_output").getPtr<T>(), sizeof(T) * st, cudaMemcpyDeviceToHost);
// printf("decoder_output at step\n");
// for (int s=0; s<seq_len; s++) {
// printf("seq at %d:\n", s);
// for (int i=0; i<10; i++) {
// printf("%f ", double(buf[s*8192+i]));
// }
// printf("\n");
// }
// printf("last: %f\n", double(buf[st-1]));
// printf("\n");
// }
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
8 changes: 8 additions & 0 deletions src/fastertransformer/models/llama/LlamaDecoder.cc
@@ -242,9 +242,16 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
Tensor& v_cache = output_tensors->at("value_cache");
std::vector<size_t> self_k_cache_size;
self_k_cache_size.push_back(local_batch_size);
size_t a = 1;
int b = 1;
for (auto t = k_cache.shape.begin() + 2; t != k_cache.shape.end(); ++t) {
self_k_cache_size.push_back(*t);
}
for (auto t = k_cache.shape.begin(); t != k_cache.shape.end(); ++t) {
a *= *t;
b *= *t;
}
// printf("a b: %ld %d\n", a, b);
std::vector<size_t> self_v_cache_size;
self_v_cache_size.push_back(local_batch_size);
for (auto t = v_cache.shape.begin() + 2; t != v_cache.shape.end(); ++t) {
@@ -297,6 +304,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
ite_cache_offset *= *t;
}
cache_offset += ite_cache_offset;
// printf("cache_offset %ld ite_cache_offset %ld\n", cache_offset, ite_cache_offset);

TensorMap self_attention_output_tensors{
{"hidden_features", Tensor{MEMORY_GPU, data_type, {local_batch_size, hidden_units_}, self_attn_output_}},