From 0c2aad7a3eae76e744978d5b79b65ac7b0958841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 1 Nov 2025 23:27:09 +0100 Subject: [PATCH 1/4] add n_embd_full to support extended embed --- include/llama.h | 1 + src/llama-context.cpp | 14 +++++++------- src/llama-graph.cpp | 4 ++-- src/llama-hparams.h | 1 + src/llama-model.cpp | 23 ++++++++++------------- src/models/qwen3vl-moe.cpp | 3 +-- src/models/qwen3vl.cpp | 5 +---- tools/mtmd/mtmd.cpp | 2 +- 8 files changed, 24 insertions(+), 29 deletions(-) diff --git a/include/llama.h b/include/llama.h index 08e2ffa34c9f6..363ec33d44c72 100644 --- a/include/llama.h +++ b/include/llama.h @@ -482,6 +482,7 @@ extern "C" { LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_embd_full(const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6192a36e0ee5..c9829cc87fa80 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -620,7 +620,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } - return embd + j*model.hparams.n_embd; + return embd + j*model.hparams.n_embd_full; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -808,7 +808,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd_full; const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -977,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; const int64_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd_full; // when computing embeddings, all tokens are output const bool output_all = cparams.embeddings; @@ -1276,7 +1276,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; + const auto n_embd = hparams.n_embd_full; bool has_logits = true; bool has_embd = cparams.embeddings; @@ -1340,7 +1340,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { void llama_context::output_reorder() { const uint64_t n_vocab = model.vocab.n_tokens(); - const uint64_t n_embd = model.hparams.n_embd; + const uint64_t n_embd = model.hparams.n_embd_full; for (size_t s = 0; s < output_swaps.size(); ++s) { const uint64_t i0 = output_swaps[s].i0; @@ -1883,7 +1883,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__); - const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd_full); io.write(&embd_size, sizeof(embd_size)); @@ -2135,7 +2135,7 @@ void llama_context::opt_epoch_iter( batch.logits [pos_batch] = 
true; } - if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { + if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_full, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f9751b3183694..eb909a86adb7d 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1142,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( // input embeddings with optional lora ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd_full; auto inp = std::make_unique(); @@ -1279,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const { // return cur; //} - const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd; + const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_full; const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 539fecb3f7817..104d5190623e5 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -40,6 +40,7 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; + uint32_t n_embd_full; // main + auxiliary embeds uint32_t n_embd_features = 0; uint32_t n_layer; int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04239181c7765..1d11cc121bee7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -276,7 +276,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_IM2COL: { - const int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd_full; ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); } break; @@ -505,6 +505,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + hparams.n_embd_full = hparams.n_embd; if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); @@ -1041,7 +1042,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { } // since vision model stacks deepstack features along feature dim // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd *= hparams.n_deepstack_layers + 1; + hparams.n_embd_full *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_QWEN3MOE: { @@ -1067,7 +1068,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { } // since vision model stacks deepstack features along feature dim // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd *= hparams.n_deepstack_layers + 1; + hparams.n_embd_full *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_PHI2: { @@ -3332,10 +3333,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3VL: { - // for 
model loading, the weights only have the main embd - // so we need to divide by the number of deepstack layers + 1 - // n_embd is const int so we declare a new variable - int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -3371,10 +3368,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: { - // for model loading, the weights only have the main embd - // so we need to divide by the number of deepstack layers + 1 - // n_embd is const int so we declare a new variable - int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -6681,8 +6674,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { return ::select_buft( *pimpl->dev_layer.at(il).buft_list, [&](ggml_context * ctx) { - ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); - ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); + ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_full); + ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_full); return ggml_add(ctx, cur, layer_dir); }); } @@ -7329,6 +7322,10 @@ int32_t llama_model_n_embd(const llama_model * model) { return model->hparams.n_embd; } +int32_t llama_model_n_embd_full(const llama_model * model) { + return model->hparams.n_embd_full; +} + int32_t llama_model_n_layer(const llama_model * model) { return model->hparams.n_layer; } diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp index c48643c0cd140..f72f80a83768b 100644 --- a/src/models/qwen3vl-moe.cpp +++ b/src/models/qwen3vl-moe.cpp @@ -1,9 +1,8 @@ #include "models.h" llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; - const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); + const int64_t n_embd = hparams.n_embd; const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 10b36c1f65e91..0bae52239ca94 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -1,13 +1,10 @@ #include "models.h" llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - - const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; - const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); + const int64_t n_embd = hparams.n_embd; const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 196641dd95ef4..1ab0a3be17c15 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -151,7 +151,7 @@ struct mtmd_context { print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), media_marker (ctx_params.media_marker), - n_embd_text (llama_model_n_embd(text_model)) + n_embd_text (llama_model_n_embd_full(text_model)) { if (std::string(ctx_params.image_marker) != 
MTMD_DEFAULT_IMAGE_MARKER) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); From 8e28665014ffa27d7a1a927c64b1e195610d6c6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 2 Nov 2025 20:15:54 +0100 Subject: [PATCH 2/4] don't change output [no ci] --- src/llama-context.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c9829cc87fa80..c5da44aca1a6f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1276,7 +1276,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd_full; + const auto n_embd = hparams.n_embd; bool has_logits = true; bool has_embd = cparams.embeddings; @@ -1340,7 +1340,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { void llama_context::output_reorder() { const uint64_t n_vocab = model.vocab.n_tokens(); - const uint64_t n_embd = model.hparams.n_embd_full; + const uint64_t n_embd = model.hparams.n_embd; for (size_t s = 0; s < output_swaps.size(); ++s) { const uint64_t i0 = output_swaps[s].i0; @@ -1883,7 +1883,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__); - const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd_full); + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); io.write(&embd_size, sizeof(embd_size)); From 03341fb0c92778aecdfe6e72367fddff22b2b8bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 3 Nov 2025 22:13:09 +0100 Subject: [PATCH 3/4] rename to n_embd_inp --- include/llama.h | 2 +- src/llama-context.cpp | 8 ++++---- src/llama-graph.cpp | 4 ++-- src/llama-hparams.cpp | 10 ++++++++++ src/llama-hparams.h | 4 +++- src/llama-model.cpp | 21 ++++++++------------- tools/mtmd/mtmd.cpp | 2 +- 7 files changed, 29 insertions(+), 22 deletions(-) diff --git a/include/llama.h b/include/llama.h index 363ec33d44c72..df80a350daa3c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -482,7 +482,7 @@ extern "C" { LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_embd_full(const struct llama_model * model); + LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c5da44aca1a6f..c047abefc811c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -620,7 +620,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } - return embd + j*model.hparams.n_embd_full; + return embd + j*model.hparams.n_embd_inp(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -808,7 +808,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = 
hparams.n_embd_full; + const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -977,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; const int64_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd_full; + const int64_t n_embd = hparams.n_embd_inp(); // when computing embeddings, all tokens are output const bool output_all = cparams.embeddings; @@ -2135,7 +2135,7 @@ void llama_context::opt_epoch_iter( batch.logits [pos_batch] = true; } - if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_full, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { + if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index eb909a86adb7d..b199e94628fff 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1142,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( // input embeddings with optional lora ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { - const int64_t n_embd = hparams.n_embd_full; + const int64_t n_embd = hparams.n_embd_inp(); auto inp = std::make_unique(); @@ -1279,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const { // return cur; //} - const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_full; + const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp(); const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 514d653844c40..8cdbaf69fc01b 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { return n_head/n_head_kv; } +uint32_t llama_hparams::n_embd_inp() const { + uint32_t n_embd_inp = n_embd; + + if (n_deepstack_layers > 0) { + n_embd_inp += n_embd * n_deepstack_layers; + } + + return n_embd_inp; +} + uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { const uint32_t n_head_kv = this->n_head_kv(il); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 104d5190623e5..9203af83b2e32 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -40,7 +40,6 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_embd_full; // main + auxiliary embeds uint32_t n_embd_features = 0; uint32_t n_layer; int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache @@ -228,6 +227,9 @@ struct llama_hparams { uint32_t n_gqa(uint32_t il = 0) const; + // dimension of main + auxiliary input embeddings + uint32_t n_embd_inp() const; + // dimension of key embeddings across all k-v heads uint32_t n_embd_k_gqa(uint32_t il = 0) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1d11cc121bee7..c9ed3145d7aa7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_IM2COL: { - const int n_embd = hparams.n_embd_full; - ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, 
w->ne[1], 1, 1); + const int n_embd_inp = hparams.n_embd_inp(); + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1); op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); } break; case GGML_OP_SCALE: @@ -505,7 +505,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); - hparams.n_embd_full = hparams.n_embd; if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); @@ -1040,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } - // since vision model stacks deepstack features along feature dim - // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd_full *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_QWEN3MOE: { @@ -1066,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } - // since vision model stacks deepstack features along feature dim - // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd_full *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_PHI2: { @@ -6475,6 +6468,7 @@ void llama_model::print_info() const { if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); @@ -6674,8 +6668,9 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { return ::select_buft( *pimpl->dev_layer.at(il).buft_list, [&](ggml_context * ctx) { - ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_full); - ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_full); + const int n_embd_inp = hparams.n_embd_inp(); + ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_inp); + ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_inp); return ggml_add(ctx, cur, layer_dir); }); } @@ -7322,8 +7317,8 @@ int32_t llama_model_n_embd(const llama_model * model) { return model->hparams.n_embd; } -int32_t llama_model_n_embd_full(const llama_model * model) { - return model->hparams.n_embd_full; +int32_t llama_model_n_embd_inp(const llama_model * model) { + return model->hparams.n_embd_inp(); } int32_t llama_model_n_layer(const llama_model * model) { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 1ab0a3be17c15..5737ba5270db8 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -151,7 +151,7 @@ struct mtmd_context { print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), media_marker (ctx_params.media_marker), - n_embd_text (llama_model_n_embd_full(text_model)) + n_embd_text (llama_model_n_embd_inp(text_model)) { if (std::string(ctx_params.image_marker) != 
MTMD_DEFAULT_IMAGE_MARKER) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); From 6faa87f33754d6c5d4af6e26df4e21ff6bc29534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 7 Nov 2025 10:54:09 +0100 Subject: [PATCH 4/4] restore n_embd where applicable --- src/llama-context.cpp | 2 +- src/llama-model.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c047abefc811c..fc01a42257c75 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -620,7 +620,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } - return embd + j*model.hparams.n_embd_inp(); + return embd + j*model.hparams.n_embd; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c9ed3145d7aa7..54b925504081a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6668,9 +6668,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { return ::select_buft( *pimpl->dev_layer.at(il).buft_list, [&](ggml_context * ctx) { - const int n_embd_inp = hparams.n_embd_inp(); - ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_inp); - ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_inp); + ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); + ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); return ggml_add(ctx, cur, layer_dir); }); }
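
---

A minimal usage sketch of the API this series ends up with (not part of the patches themselves, and assuming the `llama_model_n_embd_inp()` declaration added in PATCH 3/4): callers that feed pre-computed embeddings, such as mtmd, size per-token rows by `llama_model_n_embd_inp()` rather than `llama_model_n_embd()`, since the former includes the stacked deepstack features for Qwen3-VL style models. The helper name and token count below are illustrative.

```cpp
// Sketch only: sizing an input-embedding buffer per the new API.
// llama_model_n_embd_inp() comes from PATCH 3/4 above; everything else
// here (function name, counts) is illustrative.
#include "llama.h"

#include <vector>

static std::vector<float> alloc_inp_embd(const llama_model * model, int32_t n_tokens) {
    // for deepstack models this is n_embd * (n_deepstack_layers + 1),
    // for all other architectures it equals llama_model_n_embd(model)
    const int32_t n_embd_inp = llama_model_n_embd_inp(model);

    // one row of n_embd_inp floats per input token
    return std::vector<float>((size_t) n_tokens * n_embd_inp, 0.0f);
}
```

Output-side buffers (logits/embeddings returned to the user) keep using `llama_model_n_embd()`, which is what PATCH 2/4 and 4/4 restore.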