@@ -6578,6 +6578,215 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
+        self.block_count = self.hparams["num_hidden_layers"] + 1
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.dir_model, trust_remote_code=True
+        )
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token(
+            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]
+        )
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
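+        # <|user|> and <|observation|> both end an assistant turn in the GLM-4 chat
+        # format, so each is registered as an end-of-generation (eog) token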
+        special_vocab._set_special_token("eog", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("eog", tokenizer.get_added_vocab()["<|observation|>"])
+        special_vocab._set_special_token(
+            "unk", tokenizer.get_added_vocab()["<|endoftext|>"]
+        )
+        special_vocab._set_special_token(
+            "bos", tokenizer.get_added_vocab()["<|endoftext|>"]
+        )
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = (
+                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+            )
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
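+        # e.g. head_dim 128 with partial_rotary_factor 0.5 -> 64 rotary dimensions
+        # (illustrative values; the actual numbers come from the model's config.json)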
+
+        # MoE parameters
+        if (n_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        # Note: expert_used_count is already set by parent class using num_experts_per_tok
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
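+        # first_k_dense_replace = number of leading blocks that keep a dense FFN before MoE layers start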
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _shared_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding
+        if name == "model.embed_tokens.weight":
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1 and "shared_experts" not in name:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            # Extend experts array if needed (for models where actual layers > num_hidden_layers)
+            while len(self._experts) <= bid:
+                self._experts.append({})
+
+            self._experts[bid][name] = data_torch
+
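+            # each routed expert contributes gate_proj, up_proj and down_proj, so
+            # n_experts * 3 collected tensors means this layer's experts are complete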
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
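+                    # stacking along dim 0 gives one tensor of shape (n_experts, out_features, in_features)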
+                    # Generate GGUF tensor names for merged experts
+                    if w_name == "down_proj":
+                        new_name = f"blk.{bid}.ffn_down_exps.weight"
+                    elif w_name == "gate_proj":
+                        new_name = f"blk.{bid}.ffn_gate_exps.weight"
+                    elif w_name == "up_proj":
+                        new_name = f"blk.{bid}.ffn_up_exps.weight"
+                    else:
+                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                        new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        # Handle expert gating input (routing gate)
+        if ".mlp.gate.e_score_correction_bias" in name:
+            new_name = name.replace("model.layers.", "blk.").replace(
+                ".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias"
+            )
+            return [(new_name, data_torch)]
+        elif ".mlp.gate.weight" in name:
+            new_name = name.replace("model.layers.", "blk.").replace(
+                ".mlp.gate.weight", ".ffn_gate_inp.weight"
+            )
+            return [(new_name, data_torch)]
+
+        # Handle shared expert tensors
+        if ".mlp.shared_experts." in name:
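+            # e.g. model.layers.3.mlp.shared_experts.gate_proj.weight -> blk.3.ffn_gate_shexp.weight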
+            new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
+            if "gate_proj" in new_name:
+                new_name = new_name.replace("gate_proj", "gate_shexp")
+            elif "down_proj" in new_name:
+                new_name = new_name.replace("down_proj", "down_shexp")
+            elif "up_proj" in new_name:
+                new_name = new_name.replace("up_proj", "up_shexp")
+            return [(new_name, data_torch)]
+
+        # Handle regular dense FFN layers (for hybrid dense/MoE architecture)
+        if ".mlp." in name and "experts" not in name and "_shexp" not in name:
+            if "gate_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.gate_proj.weight", ".ffn_gate.weight"
+                )
+            elif "up_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.up_proj.weight", ".ffn_up.weight"
+                )
+            elif "down_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.down_proj.weight", ".ffn_down.weight"
+                )
+            else:
+                new_name = name
+            return [(self.map_tensor_name(new_name), data_torch)]
+
+        # Handle special NextN tensors - preserve for future MTP support
+        if (
+            ".embed_tokens." in name
+            or ".shared_head." in name
+            or ".eh_proj." in name
+            or ".enorm." in name
+            or ".hnorm." in name
+        ):
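+            # these belong to the extra NextN layer at index num_hidden_layers and are
+            # kept so a future MTP implementation can use them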
+            new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
+            return [(new_name, data_torch)]
+
+        # GLM tensor mapping - handle directly without map_tensor_name
+        if ".input_layernorm." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".input_layernorm.", ".attn_norm.")
+            return [(new_name, data_torch)]
+        elif ".post_attention_layernorm." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".post_attention_layernorm.", ".ffn_norm.")
+            return [(new_name, data_torch)]
+        elif ".self_attn." in name:
+            # Map GLM self_attn to standard attention naming
+            new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
+            if "q_proj" in new_name:
+                new_name = new_name.replace("q_proj", "q")
+            elif "k_proj" in new_name:
+                new_name = new_name.replace("k_proj", "k")
+            elif "v_proj" in new_name:
+                new_name = new_name.replace("v_proj", "v")
+            elif "o_proj" in new_name:
+                new_name = new_name.replace("o_proj", "output")
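+            # net effect: model.layers.N.self_attn.q_proj.weight -> blk.N.attn_q.weight,
+            # o_proj -> attn_output, and so on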
+            return [(new_name, data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(TextModel):
    model_arch = gguf.MODEL_ARCH.CHATGLM