
Commit 1462aa7

Merge pull request #110 from ggerganov/master
b2843
2 parents d11afd6 + e849648 commit 1462aa7

11 files changed: 116 additions, 31 deletions


common/common.cpp

Lines changed: 9 additions & 5 deletions
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1367,14 +1371,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }

     if (params.prompt_cache_all &&
@@ -1422,6 +1424,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -h, --help show this help message and exit\n");
     printf(" --version show version and build info\n");
     printf(" -i, --interactive run in interactive mode\n");
+    printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf(" --interactive-first run in interactive mode and wait for input right away\n");
     printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -2652,6 +2655,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
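Note (editorial, not part of the diff): `interactive_specials` is wired from the new `--interactive-specials` flag down to the interactive input path in examples/main (see main.cpp below). For example, a hypothetical run such as `./main -m model.gguf -i --interactive-specials` would let text typed at the interactive prompt contain special tokens like `<|eot_id|>` and have them tokenized as the actual special tokens rather than as plain text; without the flag the old behavior is unchanged.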

common/grammar-parser.cpp

Lines changed: 9 additions & 0 deletions
@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {

                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
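For illustration (not part of the patch): the guard added to each scanning loop turns what was previously a read past the end of the buffer on a truncated grammar (e.g. a GBNF rule like `root ::= "abc` with no closing quote) into a clean parse error. A minimal standalone sketch of the same pattern, with parse_char() and escape handling omitted:

// Standalone sketch of the end-of-input guard this patch adds; not the actual
// grammar_parser code. An unterminated quoted literal now raises an error
// instead of scanning past the terminating NUL.
#include <cstdio>
#include <stdexcept>

static const char * scan_quoted_literal(const char * pos) {
    while (*pos != '"') {
        if (!*pos) {
            throw std::runtime_error("unexpected end of input");
        }
        pos++; // the real parser calls parse_char() here to decode escapes
    }
    return pos + 1; // skip the closing quote
}

int main() {
    try {
        scan_quoted_literal("abc"); // unterminated literal: previously read out of bounds
    } catch (const std::exception & e) {
        std::printf("parse error: %s\n", e.what()); // prints: parse error: unexpected end of input
    }
    return 0;
}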

convert-hf-to-gguf-update.py

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,8 @@ def download_file_with_auth(url, token, save_path):
     logger.info("normalizer: " + json.dumps(normalizer, indent=4))
     pre_tokenizer = cfg["pre_tokenizer"]
     logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+    if "ignore_merges" in cfg["model"]:
+        logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

     logger.info("")

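Side note (editorial, not from the patch): as I understand it, `ignore_merges` is a setting on Hugging Face BPE tokenizer models (Llama 3's tokenizer sets it) which lets a word that already exists in the vocabulary be used directly instead of being re-assembled through BPE merges; logging it alongside the normalizer and pre_tokenizer makes it easier to spot tokenizers that need matching handling in llama.cpp.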
examples/llama-bench/README.md

Lines changed: 12 additions & 6 deletions
@@ -26,27 +26,33 @@ options:
   -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
   -p, --n-prompt <n> (default: 512)
   -n, --n-gen <n> (default: 128)
-  -b, --batch-size <n> (default: 512)
-  -ctk <t>, --cache-type-k <t> (default: f16)
-  -ctv <t>, --cache-type-v <t> (default: f16)
-  -t, --threads <n> (default: 112)
+  -pg <pp,tg> (default: 512,128)
+  -b, --batch-size <n> (default: 2048)
+  -ub, --ubatch-size <n> (default: 512)
+  -ctk, --cache-type-k <t> (default: f16)
+  -ctv, --cache-type-v <t> (default: f16)
+  -t, --threads <n> (default: 16)
   -ngl, --n-gpu-layers <n> (default: 99)
   -sm, --split-mode <none|layer|row> (default: layer)
   -mg, --main-gpu <i> (default: 0)
   -nkvo, --no-kv-offload <0|1> (default: 0)
+  -fa, --flash-attn <0|1> (default: 0)
   -mmp, --mmap <0|1> (default: 1)
-  -ts, --tensor_split <ts0/ts1/..> (default: 0)
+  --numa <distribute|isolate|numactl> (default: disabled)
+  -embd, --embeddings <0|1> (default: 0)
+  -ts, --tensor-split <ts0/ts1/..> (default: 0)
   -r, --repetitions <n> (default: 5)
   -o, --output <csv|json|md|sql> (default: md)
   -v, --verbose (default: 0)

 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```

-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:

 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)

 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).

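Usage example (illustrative, not part of the README diff): `llama-bench -m models/7B/ggml-model-q4_0.gguf -pg 512,128 -pg 1024,256` would run two combined prompt+generation tests; in the markdown output these appear with test labels of the form `pp512+tg128`, matching the label change in llama-bench.cpp below.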
examples/llama-bench/llama-bench.cpp

Lines changed: 56 additions & 7 deletions
@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }

+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
+    /* n_pg */ {{512, 128}},
     /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty()) { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
     if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            };
            instances.push_back(instance);
        }
+
+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_pg.first,
+                /* .n_gen = */ n_pg.second,
+                /* .n_batch = */ nb,
+                /* .n_ubatch = */ nub,
+                /* .type_k = */ tk,
+                /* .type_v = */ tv,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .split_mode = */ sm,
+                /* .main_gpu = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap = */ mmp,
+                /* .embeddings = */ embd,
+            };
+            instances.push_back(instance);
+        }
     }

     return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "test") {
+            return 13;
+        }

         int width = std::max((int)field.length(), 10);

@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
             value = test::get_backend();
         } else if (field == "test") {
             if (t.n_prompt > 0 && t.n_gen == 0) {
-                snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
             } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
             } else {
-                assert(false);
-                exit(1);
+                snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
             }
             value = buf;
         } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);

         uint64_t t_start = get_time_ns();
+
         if (t.n_prompt > 0) {
             test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
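Standalone illustration (not the code above): a `-pg` value is a single `pp,tg` pair that the parser splits on the comma and converts with `std::stoi`, via the file's own `split<std::string>()` helper, before pushing it into `params.n_pg`. A self-contained sketch of the same decomposition, using a hypothetical helper name:

// Sketch of how a "-pg 512,128" value decomposes into a prompt/generation pair;
// parse_pg is hypothetical and only mirrors the logic of the real parser above.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <utility>

static std::pair<int, int> parse_pg(const std::string & s) {
    const auto comma = s.find(',');
    if (comma == std::string::npos) {
        throw std::invalid_argument("expected <pp,tg>");
    }
    return { std::stoi(s.substr(0, comma)), std::stoi(s.substr(comma + 1)) };
}

int main() {
    const auto pg = parse_pg("512,128");
    std::printf("pp%d+tg%d\n", pg.first, pg.second); // prints: pp512+tg128
    return 0;
}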

examples/llava/llava-cli.cpp

Lines changed: 5 additions & 0 deletions
@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     LOG_TEE("\n");

     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);

examples/main/main.cpp

Lines changed: 5 additions & 1 deletion
@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
     }

     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }

     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
@@ -879,7 +883,7 @@ int main(int argc, char ** argv) {
             }

             const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
             const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
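This is where the flag takes effect: the last argument of the `::llama_tokenize` wrapper controls special-token parsing and was previously hard-coded to `false` for user input. With `--interactive-specials` it becomes `true`, so special-token markers typed at the prompt map to their token ids; without the flag they continue to be treated as literal text, which is why the option is opt-in.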

ggml-metal.m

Lines changed: 10 additions & 9 deletions
@@ -633,14 +633,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@@ -772,8 +772,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+            return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             return ctx->support_simdgroup_reduction &&
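Summary (editorial): the flash-attention kernels are now registered only on devices that report simdgroup matrix-multiplication support, with the vec variants keyed to simdgroup reduction, and `ggml_metal_supports_op` advertises `GGML_OP_FLASH_ATTN_EXT` under the same condition, so GPUs without that support can fall back rather than hit a missing kernel; as the in-line TODO notes, gating on `support_simdgroup_mm` is stricter than the vec kernels actually require.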

llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -3860,7 +3860,7 @@ static void llm_load_hparams(
         switch (hparams.n_layer) {
             case 22: model.type = e_model::MODEL_1B; break;
             case 26: model.type = e_model::MODEL_3B; break;
-            case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+            case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
             case 40: model.type = e_model::MODEL_13B; break;
             case 48: model.type = e_model::MODEL_34B; break;
             case 60: model.type = e_model::MODEL_30B; break;
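Rationale (inferred, not stated in the diff): both 7B-class LLaMA models and LLaMA 3 8B have 32 layers, and the previous GQA-based test (`n_head == n_head_kv`) can mislabel other 32-layer models that also use GQA; vocabulary size is a cleaner discriminator, since 7B-era models use a roughly 32k-entry vocabulary while LLaMA 3 uses about 128k, hence the `n_vocab < 40000` threshold.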
