
Commit cd36b18

Merge 'origin/master' into hipblas

2 parents: afcb8fe + 1cbf561

17 files changed: +1246 −620 lines

.devops/tools.sh (+4)

```diff
@@ -26,6 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
             ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
+elif [[ $arg1 == '--server' || $arg1 == '-s' ]]; then
+    ./server $arg2
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
@@ -37,4 +39,6 @@ else
     echo "             ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "                     ex: \"/models/\" 7B"
+   echo "  --server (-s): Run a model on the server"
+   echo "                 ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
fi
```
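The new `--server` option simply forwards its arguments to the bundled `server` binary. A minimal sketch of how this could be invoked through the tools.sh entrypoint of a full Docker image, mirroring the example in the script's own help text; the image tag and host paths are assumptions for illustration, not part of this commit:

```bash
# Illustrative only: image tag and mounted model directory are hypothetical.
# tools.sh receives "--server" as its first argument and hands the rest to ./server.
docker run -v /srv/models:/models -p 8080:8080 llama.cpp-full \
    --server -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080
```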

CMakeLists.txt (+1 −1)

```diff
@@ -273,7 +273,7 @@ if (LLAMA_CUBLAS)
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
        endif()
```
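Note that this default only applies when the user has not set `CMAKE_CUDA_ARCHITECTURES` explicitly; with `LLAMA_CUDA_DMMV_F16` enabled the build now targets compute capability 6.0 as well as 6.1. A hedged sketch of a configure step that would take this branch (build directory and config are illustrative):

```bash
# Configure with cuBLAS and the f16 DMMV path enabled;
# CMAKE_CUDA_ARCHITECTURES is left unset, so the new default "60;61" applies.
cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_DMMV_F16=ON
cmake --build build --config Release
```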

README.md (+1 −1)

````diff
@@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options.
 - Using `Zig`:
 
   ```bash
-  zig build -Drelease-fast
+  zig build -Doptimize=ReleaseFast
   ```
 
 ### Metal Build
````

examples/common.cpp (+29 −1)

```diff
@@ -236,6 +236,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.mirostat_tau = std::stof(argv[i]);
+        } else if (arg == "--cfg-negative-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cfg_scale = std::stof(argv[i]);
+        } else if (arg == "--cfg-smooth-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cfg_smooth_factor = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -469,6 +487,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
     fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
     fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stderr, "  --cfg-negative-prompt PROMPT \n");
+    fprintf(stderr, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
@@ -535,7 +557,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx = params.n_ctx;
@@ -551,6 +573,12 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
 
+    return lparams;
+}
+
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+    auto lparams = llama_context_params_from_gpt_params(params);
+
     llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
```

examples/common.h (+7)

```diff
@@ -48,6 +48,12 @@ struct gpt_params {
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
 
+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt;       // string to help guidance
+    float       cfg_scale         = 1.f;   // How strong is guidance
+    float       cfg_smooth_factor = 1.f;   // Smooth factor between old and new logits
+
     std::string model = "models/7B/ggml-model.bin"; // model path
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
@@ -99,6 +105,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 //
 
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
 //
 // Console utils
```
examples/main/main.cpp (+84 −4)

```diff
@@ -109,10 +109,16 @@ int main(int argc, char ** argv) {
 
     llama_model * model;
     llama_context * ctx;
+    llama_context * ctx_guidance = NULL;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (params.cfg_scale > 1.f) {
+        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+        ctx_guidance = llama_new_context_with_model(model, lparams);
+    }
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
@@ -183,15 +189,28 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        // Add a space in front of the first character to match OG llama tokenizer behavior
-        params.prompt.insert(0, 1, ' ');
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
 
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
         embd_inp = session_tokens;
     }
 
+    // Tokenize negative prompt
+    std::vector<llama_token> guidance_inp;
+    int guidance_offset = 0;
+    int original_prompt_len = 0;
+    if (ctx_guidance) {
+        params.cfg_negative_prompt.insert(0, 1, ' ');
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        original_prompt_len = original_inp.size();
+        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+    }
+
     const int n_ctx = llama_n_ctx(ctx);
 
     if ((int) embd_inp.size() > n_ctx - 4) {
@@ -258,6 +277,16 @@ int main(int argc, char ** argv) {
         for (int i = 0; i < (int) embd_inp.size(); i++) {
             fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
         }
+
+        if (ctx_guidance) {
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            for (int i = 0; i < (int) guidance_inp.size(); i++) {
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+            }
+        }
+
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -334,11 +363,13 @@ int main(int argc, char ** argv) {
     int n_remain           = params.n_predict;
     int n_consumed         = 0;
     int n_session_consumed = 0;
+    int n_past_guidance    = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console_set_color(con_st, CONSOLE_COLOR_PROMPT);
 
     std::vector<llama_token> embd;
+    std::vector<llama_token> embd_guidance;
 
     // do one empty run to warm up the model
     {
@@ -367,11 +398,12 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                 const int n_left = n_past - params.n_keep;
 
                 // always keep the first token - BOS
                 n_past = std::max(1, params.n_keep);
+                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
 
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -412,6 +444,48 @@ int main(int argc, char ** argv) {
 
             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
+
+            if (ctx_guidance) {
+                int input_size = 0;
+                llama_token* input_buf = NULL;
+
+                if (n_past_guidance < (int) guidance_inp.size()) {
+                    // Guidance context should have the same data with these modifications:
+                    //
+                    // * Replace the initial prompt
+                    // * Shift everything by guidance_offset
+                    embd_guidance = guidance_inp;
+                    if (embd.begin() + original_prompt_len < embd.end()) {
+                        embd_guidance.insert(
+                            embd_guidance.end(),
+                            embd.begin() + original_prompt_len,
+                            embd.end()
+                        );
+                    }
+
+                    input_buf = embd_guidance.data();
+                    input_size = embd_guidance.size();
+                    //fprintf(stderr, "\n---------------------\n");
+                    //for (int i = 0; i < (int) embd_guidance.size(); i++) {
+                        //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                    //}
+                    //fprintf(stderr, "\n---------------------\n");
+                } else {
+                    input_buf = embd.data();
+                    input_size = embd.size();
+                }
+
+                for (int i = 0; i < input_size; i += params.n_batch) {
+                    int n_eval = std::min(input_size - i, params.n_batch);
+                    if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
+                        fprintf(stderr, "%s : failed to eval\n", __func__);
+                        return 1;
+                    }
+
+                    n_past_guidance += n_eval;
+                }
+            }
+
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
@@ -431,6 +505,7 @@ int main(int argc, char ** argv) {
         }
 
         embd.clear();
+        embd_guidance.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // out of user input, sample next token
@@ -473,6 +548,10 @@ int main(int argc, char ** argv) {
 
                 llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
+                if (ctx_guidance) {
+                    llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
+                }
+
                 // Apply penalties
                 float nl_logit = logits[llama_token_nl()];
                 auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
@@ -668,6 +747,7 @@ int main(int argc, char ** argv) {
     }
 
     llama_print_timings(ctx);
+    if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
     llama_free_model(model);
 
```
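The new `llama_sample_classifier_free_guidance` call is where the two contexts meet. Judging from the paper referenced in examples/common.h (arXiv:2306.17806) and the "1.0 = disable" wording in the new help text, the guided logits presumably follow the standard classifier-free-guidance form, where $\ell_c$ are the logits from the main context, $\ell_{\bar c}$ the logits from the negative-prompt context, and $\gamma$ is `cfg_scale` (the additional smoothing controlled by `--cfg-smooth-factor` is applied inside the library and is not visible in this diff):

$$\tilde{\ell} = \ell_{\bar c} + \gamma\,(\ell_c - \ell_{\bar c})$$

With $\gamma = 1$ this collapses to $\ell_c$, i.e. ordinary sampling from the main context, which is why `ctx_guidance` is only created when `params.cfg_scale > 1.f`.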

examples/train-text-from-scratch/train-text-from-scratch.cpp (+3 −11)

```diff
@@ -1354,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
         }
     }
 
-    if (t->src0) {
-        expand(g, t->src0);
-    }
-
-    if (t->src1) {
-        expand(g, t->src1);
-    }
-
-    for (int i = 0; i < GGML_MAX_OPT; ++i) {
-        if (t->opt[i]) {
-            expand(g, t->opt[i]);
+    for (int i = 0; i < GGML_MAX_SRC; ++i) {
+        if (t->src[i]) {
+            expand(g, t->src[i]);
         }
     }
 
```