diff --git a/common/common.cpp b/common/common.cpp
index 0535508ba98df..ba1ecf0e59c8b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1367,14 +1371,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }
 
     if (params.prompt_cache_all &&
@@ -1422,6 +1424,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help                show this help message and exit\n");
     printf("  --version                 show version and build info\n");
     printf("  -i, --interactive         run in interactive mode\n");
+    printf("  --interactive-specials    allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first       run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation      run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct          run in instruction mode (use with Alpaca models)\n");
@@ -2652,6 +2655,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
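Review note on the `gpt_params_parse_ex` hunk: moving the `invalid_param` check inside the loop means the exception now names the flag whose value actually failed to parse, instead of whatever `arg` happened to hold after the last iteration. A minimal sketch of the corrected shape (loop body assumed from the surrounding code):

```cpp
// Validate right after parsing each argument, so the error message
// refers to the offending flag rather than the final one seen.
for (int i = 1; i < argc; i++) {
    std::string arg = argv[i];
    bool invalid_param = false;
    if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
        throw std::invalid_argument("error: unknown argument: " + arg);
    }
    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }
}
```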
"true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); diff --git a/common/common.h b/common/common.h index 6f00a2cca8883..d80344f2a61bb 100644 --- a/common/common.h +++ b/common/common.h @@ -140,6 +140,7 @@ struct gpt_params { bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 2a1301569793a..fecb7cd713ea1 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -142,6 +142,9 @@ namespace grammar_parser { pos++; last_sym_start = out_elements.size(); while (*pos != '"') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } auto char_pair = parse_char(pos); pos = char_pair.second; out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); @@ -156,6 +159,9 @@ namespace grammar_parser { } last_sym_start = out_elements.size(); while (*pos != ']') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } auto char_pair = parse_char(pos); pos = char_pair.second; enum llama_gretype type = last_sym_start < out_elements.size() @@ -164,6 +170,9 @@ namespace grammar_parser { out_elements.push_back({type, char_pair.first}); if (pos[0] == '-' && pos[1] != ']') { + if (!pos[1]) { + throw std::runtime_error("unexpected end of input"); + } auto endchar_pair = parse_char(pos + 1); pos = endchar_pair.second; out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index e6468772210f8..b5eb41eacdab7 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -161,6 +161,8 @@ def download_file_with_auth(url, token, save_path): logger.info("normalizer: " + json.dumps(normalizer, indent=4)) pre_tokenizer = cfg["pre_tokenizer"] logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) + if "ignore_merges" in cfg["model"]: + logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)) logger.info("") diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 10f37b4418897..8578405646af7 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -26,16 +26,21 @@ options: -m, --model (default: models/7B/ggml-model-q4_0.gguf) -p, --n-prompt (default: 512) -n, --n-gen (default: 128) - -b, --batch-size (default: 512) - -ctk , --cache-type-k (default: f16) - -ctv , --cache-type-v (default: f16) - -t, --threads (default: 112) + -pg (default: 512,128) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: 16) -ngl, --n-gpu-layers (default: 99) -sm, --split-mode (default: layer) -mg, --main-gpu (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) -mmp, --mmap <0|1> (default: 1) - -ts, --tensor_split (default: 0) + --numa (default: 
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index e6468772210f8..b5eb41eacdab7 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -161,6 +161,8 @@ def download_file_with_auth(url, token, save_path):
         logger.info("normalizer: " + json.dumps(normalizer, indent=4))
         pre_tokenizer = cfg["pre_tokenizer"]
         logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        if "ignore_merges" in cfg["model"]:
+            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
 
     logger.info("")
 
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 10f37b4418897..8578405646af7 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -26,16 +26,21 @@ options:
   -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
   -p, --n-prompt <n>                  (default: 512)
   -n, --n-gen <n>                     (default: 128)
-  -b, --batch-size <n>                (default: 512)
-  -ctk <t>, --cache-type-k <t>        (default: f16)
-  -ctv <t>, --cache-type-v <t>        (default: f16)
-  -t, --threads <n>                   (default: 112)
+  -pg <pp,tg>                         (default: 512,128)
+  -b, --batch-size <n>                (default: 2048)
+  -ub, --ubatch-size <n>              (default: 512)
+  -ctk, --cache-type-k <t>            (default: f16)
+  -ctv, --cache-type-v <t>            (default: f16)
+  -t, --threads <n>                   (default: 16)
   -ngl, --n-gpu-layers <n>            (default: 99)
   -sm, --split-mode <none|layer|row>  (default: layer)
   -mg, --main-gpu <i>                 (default: 0)
   -nkvo, --no-kv-offload <0|1>        (default: 0)
+  -fa, --flash-attn <0|1>             (default: 0)
   -mmp, --mmap <0|1>                  (default: 1)
-  -ts, --tensor_split <ts0/ts1/..>    (default: 0)
+  --numa <distribute|isolate|numactl> (default: disabled)
+  -embd, --embeddings <0|1>           (default: 0)
+  -ts, --tensor-split <ts0/ts1/..>    (default: 0)
   -r, --repetitions <n>               (default: 5)
   -o, --output <csv|json|md|sql>      (default: md)
   -v, --verbose                       (default: 0)
@@ -43,10 +48,11 @@ options:
 
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
 
-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:
 
 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
 
 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
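A usage sketch for the new test type (illustrative invocation; with the defaults shown above this matches what a bare run would measure): the command below runs a pp512 test, a tg128 test, and a combined test reported as `pp512+tg128`:

```sh
./llama-bench -m models/7B/ggml-model-q4_0.gguf -p 512 -n 128 -pg 512,128
```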
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 40128ec444334..8b965e1990ba5 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }
 
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
    std::vector<int> n_batch;
    std::vector<int> n_ubatch;
    std::vector<ggml_type> type_k;
@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
     /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt      */ {512},
     /* n_gen         */ {128},
+    /* n_pg          */ {{512, 128}},
     /* n_batch       */ {2048},
     /* n_ubatch      */ {512},
     /* type_k        */ {GGML_TYPE_F16},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -pg <pp,tg>                         (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub N, --ubatch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -ub, --ubatch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty())    { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty())    { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty())     { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty())  { params.n_batch = cmd_params_defaults.n_batch; }
     if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty())   { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            };
            instances.push_back(instance);
        }
+
+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ n_pg.first,
+                /* .n_gen        = */ n_pg.second,
+                /* .n_batch      = */ nb,
+                /* .n_ubatch     = */ nub,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .split_mode   = */ sm,
+                /* .main_gpu     = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn   = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap     = */ mmp,
+                /* .embeddings   = */ embd,
+            };
+            instances.push_back(instance);
+        }
    }
 
    return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "test") {
+            return 13;
+        }
 
         int width = std::max((int)field.length(), 10);
 
@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
                 value = test::get_backend();
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                 } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                 } else {
-                    assert(false);
-                    exit(1);
+                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                 }
                 value = buf;
             } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);
 
         uint64_t t_start = get_time_ns();
+
         if (t.n_prompt > 0) {
             test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
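Review note on the measurement: per the final hunk, a `-pg` instance runs prompt processing and token generation back to back inside one timed region, so its t/s figure covers `n_prompt + n_gen` tokens over the combined wall time. A rough sketch (the `test_prompt`/`test_gen` helpers are taken from the surrounding file; the throughput formula is an assumption based on that timing):

```cpp
// Combined pp+tg measurement, simplified from the benchmark's main loop.
const int n_prompt = 512;
const int n_gen    = 128;

const uint64_t t_start = get_time_ns();
if (n_prompt > 0) {
    test_prompt(ctx, n_prompt, 0, n_batch, n_threads); // prompt, in batches
}
if (n_gen > 0) {
    test_gen(ctx, n_gen, n_prompt, n_threads);         // then decode one token at a time
}
const uint64_t t_ns = get_time_ns() - t_start;

const double tokens_per_second = (n_prompt + n_gen) / (t_ns * 1e-9);
```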
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 157a680b5ecdb..da60ddf2f057d 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     LOG_TEE("\n");
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 49acd6bab4074..9dee41001f12c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
     }
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
@@ -879,7 +883,7 @@ int main(int argc, char ** argv) {
                 }
 
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+                const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
diff --git a/ggml-metal.m b/ggml-metal.m
index c6817f01f2bdb..18ce5b88a1c01 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -633,14 +633,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,           true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,          argsort_f32_i32_desc,          true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,                leaky_relu_f32,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,        flash_attn_ext_f16_h64,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,        flash_attn_ext_f16_h80,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,        flash_attn_ext_f16_h96,        ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,       flash_attn_ext_f16_h112,       ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,       flash_attn_ext_f16_h128,       ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,       flash_attn_ext_f16_h256,       ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,   flash_attn_ext_vec_f16_h128,   ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,   flash_attn_ext_vec_f16_h256,   ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,                   cpy_f32_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,                   cpy_f32_f32,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,                  cpy_f32_q8_0,                  true);
@@ -772,8 +772,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+            return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             return ctx->support_simdgroup_reduction &&
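Review note on the Metal hunks: registration now passes the device's simdgroup capability instead of a hard-coded `true`, and `ggml_metal_supports_op` mirrors it, so on GPUs without simdgroup matrix multiplication the flash-attention op is reported as unsupported up front rather than failing on a missing kernel at dispatch time. As the in-diff TODO says, the op-level gate is over-restrictive: per the registration lines, the `_VEC_` kernels only need `support_simdgroup_reduction`. A hypothetical finer-grained gate (how the backend chooses between the two kernel families is not shown in this diff, so it is a parameter here):

```cpp
// Hypothetical per-family capability check for GGML_OP_FLASH_ATTN_EXT.
static bool metal_supports_flash_attn(bool support_simdgroup_mm,
                                      bool support_simdgroup_reduction,
                                      bool use_vec_kernel) {
    return use_vec_kernel ? support_simdgroup_reduction  // _VEC_ kernels
                          : support_simdgroup_mm;        // matrix kernels
}
```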
diff --git a/llama.cpp b/llama.cpp
index e7b3fd8b433b4..2f1123d4e1678 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3860,7 +3860,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index fed3c1ee3d298..0ede9e67c2f15 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -325,8 +325,12 @@ def get_rows(properties):
     for row in rows_show:
         n_prompt = int(row[-4])
         n_gen = int(row[-3])
-        assert n_prompt == 0 or n_gen == 0
-        test_name = f"tg{n_gen}" if n_prompt == 0 else f"pp{n_prompt}"
+        if n_prompt != 0 and n_gen == 0:
+            test_name = f"pp{n_prompt}"
+        elif n_prompt == 0 and n_gen != 0:
+            test_name = f"tg{n_gen}"
+        else:
+            test_name = f"pp{n_prompt}+tg{n_gen}"
         #           Regular columns     test name    avg t/s values           Speedup
         #           VVVVVVVVVVVVV       VVVVVVVVV    VVVVVVVVVVVVVV           VVVVVVV
         table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
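Review note on the llama.cpp hunk: at 32 layers the old check used GQA (`n_head == n_head_kv`) to tell 7B from 8B, which misclassifies 7B models that also use GQA, such as Mistral 7B. Vocabulary size separates the families cleanly: LLaMA 2 7B and Mistral 7B use a 32000-token vocab while LLaMA 3 8B uses 128256, so `n_vocab < 40000` picks 7B. A self-contained restatement with illustrative values:

```cpp
#include <cstdio>

enum e_model { MODEL_7B, MODEL_8B };

// Classify a 32-layer LLaMA-family checkpoint by vocabulary size.
static e_model classify_32_layer(int n_vocab) {
    return n_vocab < 40000 ? MODEL_7B : MODEL_8B;
}

int main() {
    printf("%d\n", classify_32_layer(32000));  // LLaMA 2 7B / Mistral 7B -> 0 (MODEL_7B)
    printf("%d\n", classify_32_layer(128256)); // LLaMA 3 8B -> 1 (MODEL_8B)
    return 0;
}
```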