
Commit cf09c8b

Revert "MPT : clone wte to output at load time"
It seems like upstream isn't interested in this change for the time being [1], and we are going to break compatibility with Nomic's previous conversion of MPT because of changes to the BPE tokenizer [2], so let's remove this change to minimize the diff.

This reverts commit 69c505e.

[1] ggml-org#3626
[2] ggml-org#3252

File tree

4 files changed: +9 -36 lines

convert-hf-to-gguf.py (+5)

@@ -462,6 +462,11 @@ def write_tensors(self):
 
             self.gguf_writer.add_tensor(new_name, data)
 
+            # note: MPT output is tied to (same as) wte in original model;
+            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+            if new_name == "token_embd.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+
 
 class BaichuanModel(Model):
     def set_vocab(self):
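
With this revert the wte/output tie is handled at conversion time again: the converter writes the same embedding data under both token_embd.weight and output.weight. A minimal sketch, assuming the gguf API declared in ggml.h, of how a consumer of the converted file could check for the duplicated entry (the helper name below is made up for illustration, not part of llama.cpp):

    #include <stdbool.h>
    #include "ggml.h"

    // true if the converted MPT GGUF carries the duplicated output head
    static bool mpt_gguf_has_tied_output(const struct gguf_context * ctx_gguf) {
        return gguf_find_tensor(ctx_gguf, "token_embd.weight") >= 0 &&
               gguf_find_tensor(ctx_gguf, "output.weight")     >= 0;
    }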

ggml.c (+2 -7)

@@ -18278,11 +18278,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // the ggml_tensor structs to the appropriate locations in the binary blob
 
         // compute the exact size needed for the new ggml_context
-        int n_tensors = ctx->header.n_tensors + params.extra_tensors;
         const size_t mem_size =
             params.no_alloc ?
-            (n_tensors    )*ggml_tensor_overhead() :
-            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
 
         struct ggml_init_params pdata = {
             .mem_size   = mem_size,
@@ -18591,10 +18590,6 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
-void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
-    ctx->infos[i].offset = offset;
-}
-
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
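
On the ggml side, the metadata context is again sized purely from the tensor count in the file header; the headroom that params.extra_tensors used to add is gone. The sizing rule applied above, factored into a standalone helper for illustration (the helper name is not part of ggml):

    #include "ggml.h"

    // no_alloc: the context stores only ggml_tensor structs, one overhead unit each;
    // otherwise: one extra object plus the binary data blob (ctx->size in gguf_init_from_file).
    static size_t meta_ctx_size(int64_t n_tensors, bool no_alloc, size_t data_size) {
        return no_alloc ? (size_t) n_tensors      * ggml_tensor_overhead()
                        : (size_t)(n_tensors + 1) * ggml_tensor_overhead() + data_size;
    }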

ggml.h (-2)

@@ -2010,7 +2010,6 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
-        int extra_tensors;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2054,7 +2053,6 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one

llama.cpp (+2 -27)

@@ -1817,9 +1817,8 @@ struct llama_model_loader {
 
     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
-            /*.no_alloc      = */ true,
-            /*.ctx           = */ &ctx_meta,
-            /*.extra_tensors = */ 1,
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
         };
 
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -2129,25 +2128,6 @@ struct llama_model_loader {
             done_size += ggml_nbytes(cur);
         }
     }
-
-    // must be called before calc_sizes
-    void clone_tensor(const char * src_name, const char * dst_name) {
-        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
-        GGML_ASSERT(src_idx >= 0);
-
-        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
-        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
-
-        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
-        GGML_ASSERT(cur);
-
-        ggml_set_name(cur, dst_name);
-        gguf_add_tensor(ctx_gguf, cur);
-        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
-        n_tensors++;
-        n_elements += ggml_nelements(cur);
-        n_bytes += ggml_nbytes(cur);
-    }
 };
 
 //
@@ -2714,11 +2694,6 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
-    // MPT output is tied to (same as) wte in original model
-    if (model.arch == LLM_ARCH_MPT) {
-        ml.clone_tensor("token_embd.weight", "output.weight");
-    }
-
     size_t ctx_size;
     size_t mmapped_size;
 
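With clone_tensor and the MPT special case removed, llama_model_loader is back to the plain pattern: every tensor it creates must already be listed in the GGUF, which is exactly what the converter change above guarantees. A self-contained sketch of that reverted loading pattern, using only gguf calls that survive this commit ("model.gguf" is a placeholder path):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_context * ctx_meta = NULL;
        struct gguf_init_params params = {
            /*.no_alloc = */ true,   // metadata only: tensor structs, no data
            /*.ctx      = */ &ctx_meta,
        };

        struct gguf_context * ctx_gguf = gguf_init_from_file("model.gguf", params);
        if (ctx_gguf == NULL) {
            return 1;
        }

        // every tensor the loader will see must already be an entry in the file
        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
            printf("%s @ offset %zu\n",
                   gguf_get_tensor_name(ctx_gguf, i),
                   gguf_get_tensor_offset(ctx_gguf, i));
        }

        gguf_free(ctx_gguf);
        ggml_free(ctx_meta);
        return 0;
    }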