7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -1824,6 +1824,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--image-warmup-tokens"}, "N",
+        "number of tokens used for warming up the image encoder, only used by vision models",
+        [](common_params & params, int value) {
+            params.image_warmup_tokens = value;
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_WARMUP_TOKENS"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
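Like the adjacent `--image-min-tokens` / `--image-max-tokens` options, the handler only stores the value in `common_params`; since the field defaults to -1 and every consumer checks for a value greater than 0, leaving the flag unset (or passing a non-positive value) keeps the model's built-in warmup behavior.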
1 change: 1 addition & 0 deletions common/common.h
@@ -433,6 +433,7 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s)
     int image_min_tokens = -1;
     int image_max_tokens = -1;
+    int image_warmup_tokens = -1;

     // finetune
     struct lr_opt lr;
31 changes: 25 additions & 6 deletions tools/mtmd/clip.cpp
@@ -205,21 +205,33 @@ struct clip_hparams {
     // custom value provided by user, can be undefined if not set
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;
+    int32_t custom_image_warmup_tokens = -1;

     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
         image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
         image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
-        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+
+        // LFM2-VL doesn't call set_warmup_n_tokens, but it does call set_limit_image_tokens.
+        if (custom_image_warmup_tokens > 0) {
+            warmup_image_size = static_cast<int>(std::sqrt(custom_image_warmup_tokens * patch_area));
+        } else {
+            warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+        }
     }

     void set_warmup_n_tokens(int n_tokens) {
-        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
-        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        const int cur_merge = n_merge == 0 ? 1 : n_merge;
-        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
-        // TODO: support warmup size for custom token numbers
+        if (custom_image_warmup_tokens > 0) {
+            const int cur_merge = n_merge == 0 ? 1 : n_merge;
+            const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
+            warmup_image_size = static_cast<int>(std::sqrt(custom_image_warmup_tokens * patch_area));
+        } else {
+            int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
+            GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
+            const int cur_merge = n_merge == 0 ? 1 : n_merge;
+            warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        }
     }
 };
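Both branches reduce to the same arithmetic: one merged patch covers a `(patch_size * n_merge)`-pixel square, so a budget of N tokens maps to a square warmup image of side `sqrt(N * patch_area)` — the image the encoder runs once at load time so backend buffers are sized for the configured worst case. A minimal standalone sketch of that math, with hypothetical values (the 14-pixel patch and 2×2 merge are illustration-only assumptions, not read from any particular model):

```cpp
#include <cmath>
#include <cstdio>

// Sketch of the warmup-size arithmetic above (not the real clip.cpp code).
// One output token covers a (patch_size * n_merge)^2 pixel area, so a budget
// of n_tokens corresponds to a square image of side sqrt(n_tokens * patch_area).
int warmup_side(int n_tokens, int patch_size, int n_merge) {
    const int cur_merge  = n_merge == 0 ? 1 : n_merge;
    const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
    return static_cast<int>(std::sqrt(static_cast<double>(n_tokens) * patch_area));
}

int main() {
    // e.g. --image-warmup-tokens 1024 with a 14px patch and 2x2 merge:
    // patch_area = 14*14*4 = 784, 1024*784 = 802816, sqrt(802816) = 896
    std::printf("%d\n", warmup_side(1024, 14, 2)); // prints 896
}
```

Note that, unlike the assert-guarded default path, the custom path does not require the token count to be a perfect square; the side length is simply rounded down.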

@@ -469,6 +481,9 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
+        if (ctx_params.image_warmup_tokens > 0) {
+            model.hparams.custom_image_warmup_tokens = ctx_params.image_warmup_tokens;
+        }

         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -2893,6 +2908,10 @@ struct clip_model_loader {
            if (hparams.image_max_pixels > 0) {
                LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
            }
+
+            if (hparams.custom_image_warmup_tokens > 0) {
+                LOG_INF("%s: image_warmup_tokens: %d (custom value)\n", __func__, hparams.custom_image_warmup_tokens);
+            }
        } else if (is_audio) {
            LOG_INF("\n--- audio hparams ---\n");
            LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
1 change: 1 addition & 0 deletions tools/mtmd/clip.h
@@ -34,6 +34,7 @@ struct clip_context_params {
     enum clip_flash_attn_type flash_attn_type;
     int image_min_tokens;
     int image_max_tokens;
+    int image_warmup_tokens;
 };

 struct clip_init_result {
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd-cli.cpp
@@ -138,6 +138,8 @@ struct mtmd_cli_context {
         mparams.flash_attn_type = params.flash_attn_type;
         mparams.image_min_tokens = params.image_min_tokens;
         mparams.image_max_tokens = params.image_max_tokens;
+        mparams.image_warmup_tokens = params.image_warmup_tokens;
+
         ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd.cpp
@@ -110,6 +110,7 @@ mtmd_context_params mtmd_context_params_default() {
         /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
         /* image_min_tokens */ -1,
         /* image_max_tokens */ -1,
+        /* image_warmup_tokens */ -1,
     };
     return params;
 }
@@ -177,6 +178,7 @@ struct mtmd_context {
             /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
             /* image_min_tokens */ ctx_params.image_min_tokens,
             /* image_max_tokens */ ctx_params.image_max_tokens,
+            /* image_warmup_tokens */ ctx_params.image_warmup_tokens,
         };

         auto res = clip_init(mmproj_fname, ctx_clip_params);
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd.h
@@ -86,6 +86,8 @@ struct mtmd_context_params {
     // limit number of image tokens, only for vision models with dynamic resolution
     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+    int image_warmup_tokens; // number of tokens used for the warmup image (default: -1, i.e. use the model-specific hard-coded size)
 };

MTMD_API const char * mtmd_default_marker(void);
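On the API side the new field rides along with the existing ones. A minimal sketch of how a caller could opt in, assuming a model is already loaded (the mmproj path and token budget are placeholders, and error handling is elided):

```cpp
#include "mtmd.h"

// Sketch: create an mtmd context with a fixed warmup budget of 1024 image tokens.
mtmd_context * init_with_warmup(llama_model * model) {
    mtmd_context_params mparams = mtmd_context_params_default();
    mparams.image_warmup_tokens = 1024; // values <= 0 keep the model-specific default
    return mtmd_init_from_file("mmproj.gguf", model, mparams); // placeholder path
}
```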
1 change: 1 addition & 0 deletions tools/server/README.md
@@ -172,6 +172,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
+| `--image-warmup-tokens N` | number of tokens used for warming up the image encoder, only used by vision models (default: -1)<br/>(env: LLAMA_ARG_IMAGE_WARMUP_TOKENS) |
 | `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
 | `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
 | `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
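For the server this behaves like the sibling image-token flags: a hypothetical launch such as `llama-server -m model.gguf --mmproj mmproj.gguf --image-warmup-tokens 1024` (paths are placeholders) is equivalent to exporting `LLAMA_ARG_IMAGE_WARMUP_TOKENS=1024` before starting the server.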
2 changes: 2 additions & 0 deletions tools/server/server-context.cpp
@@ -623,6 +623,8 @@ struct server_context_impl {
         mparams.flash_attn_type = params_base.flash_attn_type;
         mparams.image_min_tokens = params_base.image_min_tokens;
         mparams.image_max_tokens = params_base.image_max_tokens;
+        mparams.image_warmup_tokens = params_base.image_warmup_tokens;
+
         mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
         if (mctx == nullptr) {
             SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());