
Trace output distributions to a log file #246


Closed
wants to merge 5 commits
22 changes: 20 additions & 2 deletions main.cpp
@@ -803,6 +803,7 @@ const char * llama_print_system_info(void) {
int main(int argc, char ** argv) {
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
log_file = fopen("out.log", "w");

gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
@@ -880,7 +881,7 @@ int main(int argc, char ** argv) {

// tokenize the reverse prompt
std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;

for (auto antiprompt : params.antiprompt) {
antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
}
@@ -960,6 +961,15 @@ int main(int argc, char ** argv) {
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();

if(log_file) {
std::string intokdbg = vocab.id_to_token.at(embd[0]);
for(int i = 1; i < embd.size(); i++) {
intokdbg += '|';
intokdbg += vocab.id_to_token.at(embd[i]);
}
logprintf("\nin:'%s' n_past=%d, remaining_tokens=%d, embd.size()=%zu, embd_inp.size()=%zu\n",
intokdbg.c_str(), n_past, remaining_tokens, embd.size(), embd_inp.size());
}
if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
fprintf(stderr, "Failed to predict\n");
return 1;
@@ -990,7 +1000,7 @@ int main(int argc, char ** argv) {
logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
}

id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
id = sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);

last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
@@ -1007,6 +1017,11 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget
--remaining_tokens;
} else {
if(log_file) {
const int n_vocab = model.hparams.n_vocab;
const float temp = params.temp;
print_output(vocab, logits.data() + (logits.size() - n_vocab), temp);
}
// some user input remains from prompt or interaction, forward it to processing
while (embd_inp.size() > input_consumed) {
embd.push_back(embd_inp[input_consumed]);
@@ -1079,6 +1094,7 @@ int main(int argc, char ** argv) {
}
is_interacting = false;
}
if (log_file) fflush(log_file);

// end of text token
if (embd.back() == EOS_TOKEN_ID) {
@@ -1096,6 +1112,7 @@ int main(int argc, char ** argv) {
is_interacting = true;
}
}
logprintf("exit: remaining_tokens=%d n_past=%d goal=%lu n_predict=%d\n", remaining_tokens, n_past, embd_inp.size() + params.n_predict, params.n_predict);

#if defined (_WIN32)
signal(SIGINT, SIG_DFL);
@@ -1114,6 +1131,7 @@ int main(int argc, char ** argv) {
}

ggml_free(model.ctx);
if (log_file) fclose(log_file);

if (params.use_color) {
printf(ANSI_COLOR_RESET);
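
Note: the main.cpp hunks above call a logprintf(...) helper and reference a log_file handle. log_file is defined in utils.cpp (shown below), but logprintf's declaration is not part of the hunks in this view. As a rough, hypothetical sketch of the plumbing this diff assumes — the names mirror the diff, but the PR's actual header change may differ — it could be as simple as:

// Hypothetical sketch (e.g. in utils.h) — not shown in this diff.
#include <cstdio>

// Defined in utils.cpp; main() opens it as "out.log".
extern FILE * log_file;

// Write to the trace log only when it is open; otherwise do nothing.
#define logprintf(...) do { if (log_file) { fprintf(log_file, __VA_ARGS__); } } while (0)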
238 changes: 174 additions & 64 deletions utils.cpp
@@ -16,6 +16,8 @@
#include <alloca.h>
#endif

FILE * log_file = NULL;

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// determine sensible default number of threads.
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
@@ -456,104 +458,212 @@ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
return true;
}

struct SoftMaxSampler {
std::vector<std::pair<double, gpt_vocab::id>> logits_id; // Set by reset, sorted by top_k_sort
std::vector<double> probs; // Set by compute_probs

void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k) {
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});

logits_id.resize(top_k);
}

gpt_vocab::id llama_sample_top_p_top_k(
// Scales logits (temp, repeat penalty), then computes probabilities and sorts them.
void reset(
const gpt_vocab & vocab,
const float * logits,
std::vector<gpt_vocab::id> & last_n_tokens,
double repeat_penalty,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();

std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
const std::vector<gpt_vocab::id> & last_n_tokens,
double repeat_penalty
) {
const int n_logits = vocab.id_to_token.size();
if (repeat_penalty == 1 || n_logits == 0) {
reset(vocab, logits, temp);
return;
}
logits_id.clear();
logits_id.reserve(n_logits);

{
const double scale = 1.0/temp;
const double scale_norepeat = 1 / temp;
const double scale_repeat_neg = scale_norepeat * repeat_penalty;
const double scale_repeat_pos = scale_norepeat / repeat_penalty;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
double scale = scale_norepeat;
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
if (logits[i] < 0.0) {
logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
scale = logits[i] > 0. ? scale_repeat_pos : scale_repeat_neg;
}
logits_id.push_back(std::make_pair(logits[i] * scale, i));
}
}

sample_top_k(logits_id, top_k);
void reset(
const gpt_vocab & vocab,
const float * logits,
double temp
) {
const int n_logits = vocab.id_to_token.size();

logits_id.clear();
logits_id.reserve(n_logits);

double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
const double scale = 1.0 / temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
}

// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
void soft_max() {
const size_t n = logits_id.size();
probs.clear();
probs.reserve(n);

double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}

// compute probs for the tokens
double sum_p = 0.0;
double sum_act = 0.0;
double entropy = 0.0;
for (const auto & kv : logits_id) {
sum_act += kv.first;
double logp = kv.first - maxl;
double p = exp(logp);
probs.push_back(p);
sum_p += p;
entropy -= p * logp;
}

// normalize the probs
const double scale = 1.0 / sum_p;
entropy = entropy * scale + log(sum_p);
for (auto & p : probs) {
p *= scale;
}

// Scaled activations stats & distribution info
logprintf( "%s: top_sact=%f mean_sact=%f top_p=%f entropy=%f\n", __func__, logits_id[0].first, sum_act / n, probs[0], entropy);
}


// Sorts tokens by scaled logit, keeping only the top K (all tokens when top_k <= 0)
void top_k_sort(int top_k=0) {
if (top_k > 0 && top_k < logits_id.size()) {
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
} else {
std::sort(
logits_id.begin(),
logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
}
}

double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
int size() const {
return logits_id.size();
}

// normalize the probs
for (auto & p : probs) {
p /= sum;
std::discrete_distribution<> top_k() const {
return std::discrete_distribution<>(probs.begin(), probs.end());
}

if (top_p < 1.0f) {
std::discrete_distribution<> top_p(double top_p) const {
if (top_p >= 1.0f) {
return top_k();
}
int n = 1;
double cumsum = 0.0f;
for (int i = 0; i < (int) probs.size(); i++) {
for (int i = 0; i < probs.size(); i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
probs.resize(i + 1);
logits_id.resize(i + 1);
n = i + 1;
break;
}
}
logprintf("%s: n: %d sum: %f\n", __func__, n, cumsum);

// discrete_distribution renormalizes the subset of probabilities to sum to 1.0
return std::discrete_distribution<>(probs.begin(), probs.begin() + n);
}

gpt_vocab::id top() {
return logits_id[0].second;
}

gpt_vocab::id sample(
std::discrete_distribution<> & dist,
std::mt19937 & rng
) const {
return logits_id[dist(rng)].second;
}

cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
void print(FILE* log_file, const gpt_vocab & vocab, const float * logits, int max_print, int selected=-1) const {
if (log_file == nullptr) {
return;
}
int n = probs.size();
if (n > max_print) {
n = max_print;
}
for (int i = 0; i < n; i++) {
const auto& entry = logits_id[i];
const int id = entry.second;
const double scaled_logit = entry.first;
fprintf(log_file, "%s%d: '%s' p=%f act=%.3f temp=%.2f\n",
selected >= 0 && id == selected ? "->" : " ",
i,
vocab.id_to_token.at(id).c_str(),
probs[i],
logits[id],
logits[id] / scaled_logit
);
}
}
};

//printf("\n");
//for (int i = 0; i < (int) 10; i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//printf("\n\n");
//exit(0);
gpt_vocab::id sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
std::vector<gpt_vocab::id> & last_n_tokens,
double repeat_penalty,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {

SoftMaxSampler probs;
probs.reset(vocab, logits, temp, last_n_tokens, repeat_penalty);
probs.top_k_sort(top_k);
probs.soft_max();
auto dist = probs.top_p(top_p);
int sampled_tok_id = probs.sample(dist, rng);

std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
probs.print(log_file, vocab, logits, 16, sampled_tok_id);

return logits_id[idx].second;
return sampled_tok_id;
}

gpt_vocab::id print_output(
const gpt_vocab & vocab,
const float * logits,
double temp) {
SoftMaxSampler probs;
probs.reset(vocab, logits, temp);
probs.top_k_sort();
probs.soft_max();

probs.print(log_file, vocab, logits, 16);

return probs.top();
}
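
Note: the entropy bookkeeping in soft_max() is terse — it accumulates the entropy term on the unnormalized probabilities and corrects it at the end via entropy * scale + log(sum_p). A standalone sketch of the same computation, for illustration only (this is not the PR's code; it assumes a non-empty vector of already temperature-scaled logits):

// Numerically stable softmax plus entropy of the resulting distribution,
// mirroring what SoftMaxSampler::soft_max() computes over (logit, id) pairs.
#include <algorithm>
#include <cmath>
#include <vector>

struct SoftMaxStats {
    std::vector<double> probs;
    double entropy;
};

SoftMaxStats soft_max_with_entropy(const std::vector<double> & scaled_logits) {
    // Subtract the max logit so exp() cannot overflow (log-sum-exp trick).
    const double maxl = *std::max_element(scaled_logits.begin(), scaled_logits.end());

    SoftMaxStats out;
    out.probs.reserve(scaled_logits.size());

    double sum_p  = 0.0; // Z = sum of unnormalized exp(l_i - maxl)
    double ent_un = 0.0; // -sum q_i * (l_i - maxl), entropy before normalization
    for (const double l : scaled_logits) {
        const double logp = l - maxl;
        const double p    = std::exp(logp);
        out.probs.push_back(p);
        sum_p  += p;
        ent_un -= p * logp;
    }

    // Normalize: p_i = q_i / Z, and H = -sum p_i * ln p_i = ent_un / Z + ln Z.
    const double scale = 1.0 / sum_p;
    for (double & p : out.probs) {
        p *= scale;
    }
    out.entropy = ent_un * scale + std::log(sum_p);
    return out;
}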



size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
@@ -623,7 +733,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
@@ -646,7 +756,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t

*(float *) pd = d;
*(float *) pm = min;
pd += bs;
pd += bs;
pm += bs;

for (int l = 0; l < qk; l += 2) {