
Commit cf09c8b

Revert "MPT : clone wte to output at load time"
It seems like upstream isn't interested in this change for the time being [1], and we are going to break compatibility with Nomic's previous conversion of MPT because of changes to the BPE tokenizer [2], so let's remove this change to minimize the diff.

This reverts commit 69c505e.

[1] ggml-org#3626
[2] ggml-org#3252

File tree

4 files changed: +9 -36 lines

convert-hf-to-gguf.py (+5)

@@ -462,6 +462,11 @@ def write_tensors(self):
 
             self.gguf_writer.add_tensor(new_name, data)
 
+            # note: MPT output is tied to (same as) wte in original model;
+            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+            if new_name == "token_embd.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+
 
 class BaichuanModel(Model):
     def set_vocab(self):
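
With this revert the wte/output tie is handled at conversion time again: the converter writes the same embedding data under both token_embd.weight and output.weight. A minimal sketch, assuming the gguf API declared in ggml.h, of how a consumer of the converted file could check for the duplicated entry (the helper name below is made up for illustration, not part of llama.cpp):

    #include <stdbool.h>
    #include "ggml.h"

    // true if the converted MPT GGUF carries the duplicated output head
    static bool mpt_gguf_has_tied_output(const struct gguf_context * ctx_gguf) {
        return gguf_find_tensor(ctx_gguf, "token_embd.weight") >= 0 &&
               gguf_find_tensor(ctx_gguf, "output.weight")     >= 0;
    }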

ggml.c (+2 -7)

@@ -18278,11 +18278,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // the ggml_tensor structs to the appropriate locations in the binary blob
 
         // compute the exact size needed for the new ggml_context
-        int n_tensors = ctx->header.n_tensors + params.extra_tensors;
         const size_t mem_size =
             params.no_alloc ?
-            (n_tensors    )*ggml_tensor_overhead() :
-            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
 
         struct ggml_init_params pdata = {
             .mem_size   = mem_size,
@@ -18591,10 +18590,6 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
-void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
-    ctx->infos[i].offset = offset;
-}
-
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
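
On the ggml side, the metadata context is again sized purely from the tensor count in the file header; the headroom that params.extra_tensors used to add is gone. The sizing rule applied above, factored into a standalone helper for illustration (the helper name is not part of ggml):

    #include "ggml.h"

    // no_alloc: the context stores only ggml_tensor structs, one overhead unit each;
    // otherwise: one extra object plus the binary data blob (ctx->size in gguf_init_from_file).
    static size_t meta_ctx_size(int64_t n_tensors, bool no_alloc, size_t data_size) {
        return no_alloc ? (size_t) n_tensors      * ggml_tensor_overhead()
                        : (size_t)(n_tensors + 1) * ggml_tensor_overhead() + data_size;
    }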

ggml.h (-2)

@@ -2010,7 +2010,6 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
-        int extra_tensors;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2054,7 +2053,6 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one

llama.cpp (+2 -27)

@@ -1817,9 +1817,8 @@ struct llama_model_loader {
 
     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
-            /*.no_alloc      = */ true,
-            /*.ctx           = */ &ctx_meta,
-            /*.extra_tensors = */ 1,
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
         };
 
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -2129,25 +2128,6 @@ struct llama_model_loader {
             done_size += ggml_nbytes(cur);
         }
     }
-
-    // must be called before calc_sizes
-    void clone_tensor(const char * src_name, const char * dst_name) {
-        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
-        GGML_ASSERT(src_idx >= 0);
-
-        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
-        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
-
-        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
-        GGML_ASSERT(cur);
-
-        ggml_set_name(cur, dst_name);
-        gguf_add_tensor(ctx_gguf, cur);
-        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
-        n_tensors++;
-        n_elements += ggml_nelements(cur);
-        n_bytes += ggml_nbytes(cur);
-    }
 };
 
 //
@@ -2714,11 +2694,6 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
-    // MPT output is tied to (same as) wte in original model
-    if (model.arch == LLM_ARCH_MPT) {
-        ml.clone_tensor("token_embd.weight", "output.weight");
-    }
-
     size_t ctx_size;
     size_t mmapped_size;
 
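With clone_tensor and the MPT special case removed, llama_model_loader is back to the plain pattern: every tensor it creates must already be listed in the GGUF, which is exactly what the converter change above guarantees. A self-contained sketch of that reverted loading pattern, using only gguf calls that survive this commit ("model.gguf" is a placeholder path):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_context * ctx_meta = NULL;
        struct gguf_init_params params = {
            /*.no_alloc = */ true,   // metadata only: tensor structs, no data
            /*.ctx      = */ &ctx_meta,
        };

        struct gguf_context * ctx_gguf = gguf_init_from_file("model.gguf", params);
        if (ctx_gguf == NULL) {
            return 1;
        }

        // every tensor the loader will see must already be an entry in the file
        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
            printf("%s @ offset %zu\n",
                   gguf_get_tensor_name(ctx_gguf, i),
                   gguf_get_tensor_offset(ctx_gguf, i));
        }

        gguf_free(ctx_gguf);
        ggml_free(ctx_meta);
        return 0;
    }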