diff --git a/common/arg.cpp b/common/arg.cpp index 9f3c8a97546..1cd87fa2de1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1824,6 +1824,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.image_max_tokens = value; } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS")); + add_opt(common_arg( + {"--image-warmup-tokens"}, "N", + "number of tokens used for warming up the image encoder, only used by vision models", + [](common_params & params, int value) { + params.image_warmup_tokens = value; + } + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_WARMUP_TOKENS")); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", diff --git a/common/common.h b/common/common.h index 2f23d0baa83..8d0c5745ec7 100644 --- a/common/common.h +++ b/common/common.h @@ -433,6 +433,7 @@ struct common_params { std::vector<std::string> image; // path to image file(s) int image_min_tokens = -1; int image_max_tokens = -1; + int image_warmup_tokens = -1; // finetune struct lr_opt lr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d8222d88148..c97c862c60c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -205,21 +205,33 @@ struct clip_hparams { // custom value provided by user, can be undefined if not set int32_t custom_image_min_tokens = -1; int32_t custom_image_max_tokens = -1; + int32_t custom_image_warmup_tokens = -1; void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { const int cur_merge = n_merge == 0 ? 1 : n_merge; const int patch_area = patch_size * patch_size * cur_merge * cur_merge; image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; - warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels)); + + // LFM2-VL doesn't call set_warmup_n_tokens, but it does call set_limit_image_tokens.
+ if (custom_image_warmup_tokens > 0) { + warmup_image_size = static_cast<int>(std::sqrt(custom_image_warmup_tokens * patch_area)); + } else { + warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels)); + } } void set_warmup_n_tokens(int n_tokens) { - int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens)); - GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); - const int cur_merge = n_merge == 0 ? 1 : n_merge; - warmup_image_size = n_tok_per_side * patch_size * cur_merge; - // TODO: support warmup size for custom token numbers + if (custom_image_warmup_tokens > 0) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + warmup_image_size = static_cast<int>(std::sqrt(custom_image_warmup_tokens * patch_area)); + } else { + int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + } } }; @@ -469,6 +481,9 @@ struct clip_ctx { if (ctx_params.image_max_tokens > 0) { model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens; } + if (ctx_params.image_warmup_tokens > 0) { + model.hparams.custom_image_warmup_tokens = ctx_params.image_warmup_tokens; + } backend_ptrs.push_back(backend_cpu); backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); @@ -2893,6 +2908,10 @@ struct clip_model_loader { if (hparams.image_max_pixels > 0) { LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ?
" (custom value)" : ""); } + + if (hparams.custom_image_warmup_tokens > 0) { + LOG_INF("%s: image_warmup_tokens: %d%s\n", __func__, hparams.custom_image_warmup_tokens, " (custom value)"); + } } else if (is_audio) { LOG_INF("\n--- audio hparams ---\n"); LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins); diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index c1442afe6b2..e7ee9d05f9f 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -34,6 +34,7 @@ struct clip_context_params { enum clip_flash_attn_type flash_attn_type; int image_min_tokens; int image_max_tokens; + int image_warmup_tokens; }; struct clip_init_result { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 6679de309b4..c42f069d272 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -138,6 +138,8 @@ struct mtmd_cli_context { mparams.flash_attn_type = params.flash_attn_type; mparams.image_min_tokens = params.image_min_tokens; mparams.image_max_tokens = params.image_max_tokens; + mparams.image_warmup_tokens = params.image_warmup_tokens; + ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams)); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 6690bf30046..21dd1d15e5d 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -110,6 +110,7 @@ mtmd_context_params mtmd_context_params_default() { /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO, /* image_min_tokens */ -1, /* image_max_tokens */ -1, + /* image_warmup_tokens */ -1, }; return params; } @@ -177,6 +178,7 @@ struct mtmd_context { /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO, /* image_min_tokens */ ctx_params.image_min_tokens, /* image_max_tokens */ ctx_params.image_max_tokens, + /* image_warmup_tokens */ ctx_params.image_warmup_tokens, }; auto res = clip_init(mmproj_fname, ctx_clip_params); diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 015119be897..938c4bf740a 100644 --- 
a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -86,6 +86,8 @@ struct mtmd_context_params { // limit number of image tokens, only for vision models with dynamic resolution int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) + + int image_warmup_tokens; // number of tokens used for warmup image (default: -1 AKA hard-coded for different models) }; MTMD_API const char * mtmd_default_marker(void); diff --git a/tools/server/README.md b/tools/server/README.md index f42bc7921c2..e79206f6dc3 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -172,6 +172,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | +| `--image-warmup-tokens N` | number of tokens used for warming up the image encoder, only used by vision models (default: -1)<br/>(env: LLAMA_ARG_IMAGE_WARMUP_TOKENS) | | `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model | | `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 2bf3924df90..41660f93485 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -623,6 +623,8 @@ struct server_context_impl { mparams.flash_attn_type = params_base.flash_attn_type; mparams.image_min_tokens = params_base.image_min_tokens; mparams.image_max_tokens = params_base.image_max_tokens; + mparams.image_warmup_tokens = params_base.image_warmup_tokens; + mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams); if (mctx == nullptr) { SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());