@@ -11084,7 +11084,8 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_no_cache();
+        // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -18632,7 +18633,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-        case LLM_ARCH_GEMMA_EMBEDDING:
+        // case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
             {
@@ -18681,6 +18682,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         /* attn_kv_size      */ cparams.n_ctx,
                         /* attn_n_pad        */ padding,
                         /* attn_n_swa        */ hparams.n_swa,
+                        /* attn_swa_type     */ hparams.swa_type,
                         /* recurrent_type_k  */ GGML_TYPE_F32,
                         /* recurrent_type_v  */ GGML_TYPE_F32,
                         /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
@@ -18750,6 +18752,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         cparams.n_seq_max,
                         padding,
                         hparams.n_swa,
+                        hparams.swa_type,
                         nullptr,
                         nullptr);
             }