7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -1824,6 +1824,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--image-warmup-tokens"}, "N",
+        "number of tokens used for warming up the image encoder, only used by vision models",
+        [](common_params & params, int value) {
+            params.image_warmup_tokens = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_WARMUP_TOKENS"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
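Like the adjacent `--image-min-tokens` / `--image-max-tokens` options, the handler only stores the value in `common_params`; since the field defaults to -1 and every consumer checks for a value greater than 0, leaving the flag unset (or passing a non-positive value) keeps the model's built-in warmup behavior.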
1 change: 1 addition & 0 deletions common/common.h
@@ -433,6 +433,7 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s)
     int image_min_tokens = -1;
     int image_max_tokens = -1;
+    int image_warmup_tokens = -1;

     // finetune
     struct lr_opt lr;
31 changes: 25 additions & 6 deletions tools/mtmd/clip.cpp
@@ -205,21 +205,33 @@ struct clip_hparams {
     // custom value provided by user, can be undefined if not set
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;
+    int32_t custom_image_warmup_tokens = -1;

     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
         image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
         image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
-        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+
+        // LFM2-VL doesn't call set_warmup_n_tokens, but it does call set_limit_image_tokens.
+        if (custom_image_warmup_tokens > 0) {
+            warmup_image_size = static_cast<int>(std::sqrt(custom_image_warmup_tokens * patch_area));
+        } else {
+            warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+        }
     }

     void set_warmup_n_tokens(int n_tokens) {
-        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
-        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        const int cur_merge = n_merge == 0 ? 1 : n_merge;
-        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
-        // TODO: support warmup size for custom token numbers
+        if (custom_image_warmup_tokens > 0) {
+            const int cur_merge = n_merge == 0 ? 1 : n_merge;
+            const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
+            warmup_image_size = static_cast<int>(std::sqrt(custom_image_warmup_tokens * patch_area));
+        } else {
+            int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
+            GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
+            const int cur_merge = n_merge == 0 ? 1 : n_merge;
+            warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        }
     }
 };
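Both branches reduce to the same arithmetic: one merged patch covers a `(patch_size * n_merge)`-pixel square, so a budget of N tokens maps to a square warmup image of side `sqrt(N * patch_area)` — the image the encoder runs once at load time so backend buffers are sized for the configured worst case. A minimal standalone sketch of that math, with hypothetical values (the 14-pixel patch and 2×2 merge are illustration-only assumptions, not read from any particular model):

```cpp
#include <cmath>
#include <cstdio>

// Sketch of the warmup-size arithmetic above (not the real clip.cpp code).
// One output token covers a (patch_size * n_merge)^2 pixel area, so a budget
// of n_tokens corresponds to a square image of side sqrt(n_tokens * patch_area).
int warmup_side(int n_tokens, int patch_size, int n_merge) {
    const int cur_merge  = n_merge == 0 ? 1 : n_merge;
    const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
    return static_cast<int>(std::sqrt(static_cast<double>(n_tokens) * patch_area));
}

int main() {
    // e.g. --image-warmup-tokens 1024 with a 14px patch and 2x2 merge:
    // patch_area = 14*14*4 = 784, 1024*784 = 802816, sqrt(802816) = 896
    std::printf("%d\n", warmup_side(1024, 14, 2)); // prints 896
}
```

Note that, unlike the assert-guarded default path, the custom path does not require the token count to be a perfect square; the side length is simply rounded down.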

@@ -469,6 +481,9 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
+        if (ctx_params.image_warmup_tokens > 0) {
+            model.hparams.custom_image_warmup_tokens = ctx_params.image_warmup_tokens;
+        }

         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -2893,6 +2908,10 @@ struct clip_model_loader {
            if (hparams.image_max_pixels > 0) {
                LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
            }
+
+            if (hparams.custom_image_warmup_tokens > 0) {
+                LOG_INF("%s: image_warmup_tokens: %d (custom value)\n", __func__, hparams.custom_image_warmup_tokens);
+            }
        } else if (is_audio) {
            LOG_INF("\n--- audio hparams ---\n");
            LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
1 change: 1 addition & 0 deletions tools/mtmd/clip.h
@@ -34,6 +34,7 @@ struct clip_context_params {
     enum clip_flash_attn_type flash_attn_type;
     int image_min_tokens;
     int image_max_tokens;
+    int image_warmup_tokens;
 };

 struct clip_init_result {
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd-cli.cpp
@@ -138,6 +138,8 @@ struct mtmd_cli_context {
         mparams.flash_attn_type = params.flash_attn_type;
         mparams.image_min_tokens = params.image_min_tokens;
         mparams.image_max_tokens = params.image_max_tokens;
+        mparams.image_warmup_tokens = params.image_warmup_tokens;
+
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd.cpp
@@ -110,6 +110,7 @@ mtmd_context_params mtmd_context_params_default() {
         /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
         /* image_min_tokens */ -1,
         /* image_max_tokens */ -1,
+        /* image_warmup_tokens */ -1,
     };
     return params;
 }
@@ -177,6 +178,7 @@ struct mtmd_context {
             /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
             /* image_min_tokens */ ctx_params.image_min_tokens,
             /* image_max_tokens */ ctx_params.image_max_tokens,
+            /* image_warmup_tokens */ ctx_params.image_warmup_tokens,
         };

         auto res = clip_init(mmproj_fname, ctx_clip_params);
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd.h
@@ -86,6 +86,8 @@ struct mtmd_context_params {
     // limit number of image tokens, only for vision models with dynamic resolution
     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+    int image_warmup_tokens; // number of tokens used for the warmup image (default: -1, i.e. use the model-specific hard-coded size)
 };

MTMD_API const char * mtmd_default_marker(void);
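On the API side the new field rides along with the existing ones. A minimal sketch of how a caller could opt in, assuming a model is already loaded (the mmproj path and token budget are placeholders, and error handling is elided):

```cpp
#include "mtmd.h"

// Sketch: create an mtmd context with a fixed warmup budget of 1024 image tokens.
mtmd_context * init_with_warmup(llama_model * model) {
    mtmd_context_params mparams = mtmd_context_params_default();
    mparams.image_warmup_tokens = 1024; // values <= 0 keep the model-specific default
    return mtmd_init_from_file("mmproj.gguf", model, mparams); // placeholder path
}
```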
1 change: 1 addition & 0 deletions tools/server/README.md
@@ -172,6 +172,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
+| `--image-warmup-tokens N` | number of tokens used for warming up the image encoder, only used by vision models (default: -1)<br/>(env: LLAMA_ARG_IMAGE_WARMUP_TOKENS) |
 | `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
 | `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
 | `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
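For the server this behaves like the sibling image-token flags: a hypothetical launch such as `llama-server -m model.gguf --mmproj mmproj.gguf --image-warmup-tokens 1024` (paths are placeholders) is equivalent to exporting `LLAMA_ARG_IMAGE_WARMUP_TOKENS=1024` before starting the server.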
2 changes: 2 additions & 0 deletions tools/server/server-context.cpp
@@ -623,6 +623,8 @@ struct server_context_impl {
         mparams.flash_attn_type = params_base.flash_attn_type;
         mparams.image_min_tokens = params_base.image_min_tokens;
         mparams.image_max_tokens = params_base.image_max_tokens;
+        mparams.image_warmup_tokens = params_base.image_warmup_tokens;
+
         mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
         if (mctx == nullptr) {
             SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());