From c770cf465968cace4b4c639516c6c3162309a3ea Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 2 Nov 2025 23:54:17 +0100 Subject: [PATCH 1/2] mtmd: pad mask for qwen2.5vl --- tools/mtmd/clip.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 0784e69fcdf93..2019238992abb 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -761,6 +761,14 @@ struct clip_graph { ggml_set_name(window_mask, "window_mask"); ggml_set_input(window_mask); + // if flash attn is used, we need to pad the mask + if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + int padded_nrow = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD); + window_mask = ggml_pad(ctx0, window_mask, + 0, padded_nrow - window_mask->ne[0], 0, 0); + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] GGML_ASSERT(batch_size == 1); inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); From 7b5d6305ca92ffa8f05f9f09f8164866bf9e806a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 3 Nov 2025 00:24:11 +0100 Subject: [PATCH 2/2] improve --- tools/mtmd/clip.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 2019238992abb..60516d582a5f3 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -761,11 +761,12 @@ struct clip_graph { ggml_set_name(window_mask, "window_mask"); ggml_set_input(window_mask); - // if flash attn is used, we need to pad the mask + // if flash attn is used, we need to pad the mask and cast to f16 if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - int padded_nrow = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD); - window_mask = ggml_pad(ctx0, window_mask, - 0, padded_nrow - window_mask->ne[0], 0, 0); + int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1]; + if (n_pad > 0) { + window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0); + } window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); }