diff --git a/convert.py b/convert.py
index 5949ddbe8f301..482858e453c95 100755
--- a/convert.py
+++ b/convert.py
@@ -42,6 +42,7 @@
 ARCH = gguf.MODEL_ARCH.LLAMA
 
 DEFAULT_CONCURRENCY = 8
+
 #
 # data types
 #
@@ -235,6 +236,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                         "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
+    n_experts      = None
+    n_experts_used = None
+
+    if "num_local_experts" in config:
+        n_experts = config["num_local_experts"]
+        n_experts_used = config["num_experts_per_tok"]
+
     return Params(
         n_vocab           = config["vocab_size"],
         n_embd            = config["hidden_size"],
@@ -243,6 +251,8 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         n_ff              = config["intermediate_size"],
         n_head            = (n_head := config["num_attention_heads"]),
         n_head_kv         = config.get("num_key_value_heads", n_head),
+        n_experts         = n_experts,
+        n_experts_used    = n_experts_used,
         f_norm_eps        = config["rms_norm_eps"],
         f_rope_freq_base  = config.get("rope_theta"),
         rope_scaling_type = rope_scaling_type,
@@ -257,7 +267,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
 def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
     config = json.load(open(config_path))
 
-    n_experts = None
+    n_experts      = None
     n_experts_used = None
     f_rope_freq_base = None
 
@@ -280,7 +290,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
 
     if config.get("moe"):
         n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-        n_experts = config["moe"]["num_experts"]
+        n_experts      = config["moe"]["num_experts"]
         n_experts_used = config["moe"]["num_experts_per_tok"]
         f_rope_freq_base = 1e6
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 18f75cf69eeda..0115ea1c605b1 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -150,7 +150,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate", # mixtral
+            "layers.{bid}.feed_forward.gate",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
         ),
 
         # Feed-forward up
@@ -169,7 +170,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
+            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
         ),
 
         # Feed-forward gate
@@ -180,7 +182,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
+            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
         ),
 
         # Feed-forward down
@@ -198,7 +201,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
+            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
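
For context on what the new `TensorNameMap` entries enable, here is a minimal, self-contained sketch of the name matching these additions make possible. It is an illustration only: the `FFN_GATE_EXP_PATTERNS` tuple and `matches_ffn_gate_exp` helper below are hypothetical stand-ins, not part of the `gguf-py` API.

```python
# Hypothetical sketch: the converter substitutes concrete block ids ({bid})
# and expert ids ({xid}) into each pattern, so Mixtral checkpoints are
# recognized whether they use the original layout ("layers....") or the
# HF layout ("model.layers....block_sparse_moe....") added in this diff.

FFN_GATE_EXP_PATTERNS = (
    "layers.{bid}.feed_forward.experts.{xid}.w1",            # mixtral
    "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1",  # mixtral
)

def matches_ffn_gate_exp(name: str, n_layers: int, n_experts: int) -> bool:
    """Return True if `name` is an expert FFN gate tensor in either layout."""
    for pattern in FFN_GATE_EXP_PATTERNS:
        for bid in range(n_layers):
            for xid in range(n_experts):
                if name == pattern.format(bid=bid, xid=xid):
                    return True
    return False

# Mixtral-8x7B has 32 layers and 8 experts per layer; both layouts match:
assert matches_ffn_gate_exp("model.layers.0.block_sparse_moe.experts.7.w1", 32, 8)
assert matches_ffn_gate_exp("layers.3.feed_forward.experts.0.w1", 32, 8)
```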