
Commit

convert : support safetensors format
ggerganov committed Dec 12, 2023
1 parent f1cbfab commit 6a419f4
Showing 2 changed files with 20 additions and 6 deletions.
14 changes: 12 additions & 2 deletions convert.py
@@ -42,6 +42,7 @@
ARCH = gguf.MODEL_ARCH.LLAMA

DEFAULT_CONCURRENCY = 8

#
# data types
#
@@ -235,6 +236,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")

n_experts = None
n_experts_used = None

if "num_local_experts" in config:
n_experts = config["num_local_experts"]
n_experts_used = config["num_experts_per_tok"]

return Params(
n_vocab = config["vocab_size"],
n_embd = config["hidden_size"],
@@ -243,6 +251,8 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head),
n_experts = n_experts,
n_experts_used = n_experts_used,
f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
@@ -257,7 +267,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))

n_experts = None
n_experts_used = None
f_rope_freq_base = None

@@ -280,7 +290,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:

if config.get("moe"):
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
n_experts = config["moe"]["num_experts"]
n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6

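The convert.py changes above read the Mixtral expert counts from either config format: a Hugging Face config.json exposes them as "num_local_experts" and "num_experts_per_tok", while an original-format params.json nests them under a "moe" block. Below is a minimal sketch of just that lookup logic; the MoEParams dataclass and load_moe_params helper are illustrative names, not part of convert.py.

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

@dataclass
class MoEParams:
    n_experts: Optional[int] = None       # total experts per MoE layer
    n_experts_used: Optional[int] = None  # experts routed per token

def load_moe_params(config_path: Path) -> MoEParams:
    config = json.loads(config_path.read_text())
    params = MoEParams()
    if "num_local_experts" in config:
        # Hugging Face style config.json (e.g. Mixtral)
        params.n_experts      = config["num_local_experts"]
        params.n_experts_used = config["num_experts_per_tok"]
    elif config.get("moe"):
        # original-format params.json with a "moe" section
        params.n_experts      = config["moe"]["num_experts"]
        params.n_experts_used = config["moe"]["num_experts_per_tok"]
    return params

if __name__ == "__main__":
    print(load_moe_params(Path("config.json")))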
12 changes: 8 additions & 4 deletions gguf-py/gguf/tensor_mapping.py
@@ -150,7 +150,8 @@ class TensorNameMap:
),

MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
),

# Feed-forward up
@@ -169,7 +170,8 @@ class TensorNameMap:
),

MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
),

# Feed-forward gate
@@ -180,7 +182,8 @@ class TensorNameMap:
),

MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
),

# Feed-forward down
@@ -198,7 +201,8 @@ class TensorNameMap:
),

MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
),

MODEL_TENSOR.ATTN_Q_NORM: (
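Each tuple in the tensor_mapping.py diff lists the checkpoint tensor-name patterns that resolve to one GGUF tensor kind; "{bid}" is the block (layer) index and "{xid}" the expert index, and the new "model.layers.{bid}.block_sparse_moe..." entries cover the Hugging Face naming used by Mixtral checkpoints. A small sketch of how one such pattern group expands, using plain str.format rather than gguf-py's actual mapping machinery:

# Patterns copied from the FFN_UP_EXP entry above; the helper is illustrative only.
FFN_UP_EXP_PATTERNS = (
    "layers.{bid}.feed_forward.experts.{xid}.w3",            # mixtral, original checkpoint naming
    "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3",  # mixtral, HF naming
)

def candidate_names(bid: int, xid: int) -> list[str]:
    # Substitute the block and expert indices into every known pattern.
    return [pattern.format(bid=bid, xid=xid) for pattern in FFN_UP_EXP_PATTERNS]

# Example: block 0, expert 3 yields both possible source tensor names.
print(candidate_names(0, 3))
# ['layers.0.feed_forward.experts.3.w3',
#  'model.layers.0.block_sparse_moe.experts.3.w3']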
