Commit 69c505e

MPT : clone wte to output at load time
1 parent 1d19f80 commit 69c505e

4 files changed: +36 −9 lines

convert-hf-to-gguf.py (−5)

@@ -462,11 +462,6 @@ def write_tensors(self):
 
             self.gguf_writer.add_tensor(new_name, data)
 
-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-
 
 class BaichuanModel(Model):
     def set_vocab(self):
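With the duplication dropped from the converter, newly converted MPT GGUF files carry only token_embd.weight; the output matrix is recreated at load time instead (see llama.cpp below). A minimal sketch, assuming a placeholder file name "mpt.gguf", of how a consumer of the gguf API can tell which layout a file uses:

    // sketch: detect whether a GGUF file stores a separate output matrix
    // ("mpt.gguf" is a placeholder path, not from this commit)
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct gguf_init_params params = {
            /*.no_alloc      = */ true, // read metadata only
            /*.ctx           = */ NULL, // do not materialize tensors
            /*.extra_tensors = */ 0,
        };
        struct gguf_context * ctx = gguf_init_from_file("mpt.gguf", params);
        if (!ctx) {
            return 1;
        }
        // gguf_find_tensor returns -1 when no tensor has the given name
        if (gguf_find_tensor(ctx, "output.weight") < 0) {
            printf("tied weights: loader must clone token_embd.weight\n");
        }
        gguf_free(ctx);
        return 0;
    }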

ggml.c (+7 −2)

@@ -18278,10 +18278,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // the ggml_tensor structs to the appropriate locations in the binary blob
 
     // compute the exact size needed for the new ggml_context
+    int n_tensors = ctx->header.n_tensors + params.extra_tensors;
     const size_t mem_size =
         params.no_alloc ?
-        (ctx->header.n_tensors    )*ggml_tensor_overhead() :
-        (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+        (n_tensors    )*ggml_tensor_overhead() :
+        (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
 
     struct ggml_init_params pdata = {
         .mem_size = mem_size,
@@ -18590,6 +18591,10 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
+void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
+    ctx->infos[i].offset = offset;
+}
+
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
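The extra_tensors field reserves metadata room up front: with no_alloc the context was previously sized for exactly header.n_tensors entries, so a tensor appended after loading (as clone_tensor does below) would overflow it. Rough sizing arithmetic, using a made-up tensor count for illustration:

    // sketch of the reservation arithmetic (291 is a hypothetical tensor count)
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        size_t per_tensor = ggml_tensor_overhead(); // metadata bytes per tensor
        size_t n_file     = 291;                    // tensors listed in the GGUF header
        size_t extra      = 1;                      // extra_tensors = 1
        // no_alloc sizing, as computed in gguf_init_from_file above
        size_t mem_size   = (n_file + extra)*per_tensor;
        printf("%zu bytes per tensor, %zu bytes reserved\n", per_tensor, mem_size);
        return 0;
    }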

ggml.h (+2)

@@ -2010,6 +2010,7 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+        int extra_tensors;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2053,6 +2054,7 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one
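Appending the new member keeps existing call sites source compatible: in C, an aggregate initializer that sets only the first two members zero-initializes the rest, so callers that were not updated in this commit reserve no extra space. A sketch of such an untouched caller:

    // sketch: a pre-existing initializer, left unchanged, still compiles
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ NULL,
        // .extra_tensors not listed: remaining members are zero-initialized
    };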

llama.cpp (+27 −2)

@@ -1792,8 +1792,9 @@ struct llama_model_loader {
 
     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
+            /*.no_alloc      = */ true,
+            /*.ctx           = */ &ctx_meta,
+            /*.extra_tensors = */ 1,
         };
 
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -2100,6 +2101,25 @@ struct llama_model_loader {
             done_size += ggml_nbytes(cur);
         }
     }
+
+    // must be called before calc_sizes
+    void clone_tensor(const char * src_name, const char * dst_name) {
+        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
+        GGML_ASSERT(src_idx >= 0);
+
+        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
+        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
+        GGML_ASSERT(cur);
+
+        ggml_set_name(cur, dst_name);
+        gguf_add_tensor(ctx_gguf, cur);
+        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
+        n_tensors++;
+        n_elements += ggml_nelements(cur);
+        n_bytes    += ggml_nbytes(cur);
+    }
 };
 
 //
@@ -2666,6 +2686,11 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
+    // MPT output is tied to (same as) wte in original model
+    if (model.arch == LLM_ARCH_MPT) {
+        ml.clone_tensor("token_embd.weight", "output.weight");
+    }
+
     size_t ctx_size;
     size_t mmapped_size;
 
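Since clone_tensor only appends an info entry and reuses the source tensor's offset, both names resolve to the same bytes in the file, and with mmap no data is duplicated in memory. A quick invariant check one could drop into the loader after the clone (ctx_gguf as in llama_model_loader):

    // sketch: both entries point at the same file data after clone_tensor
    int i_tok = gguf_find_tensor(ctx_gguf, "token_embd.weight");
    int i_out = gguf_find_tensor(ctx_gguf, "output.weight");
    GGML_ASSERT(i_tok >= 0 && i_out >= 0);
    GGML_ASSERT(gguf_get_tensor_offset(ctx_gguf, i_tok) ==
                gguf_get_tensor_offset(ctx_gguf, i_out));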
