
Commit cb33f43

fix embeddings when using CUDA (#3657)
1 parent: e1675d1

1 file changed: llama.cpp (+13 -6 lines)

@@ -5903,6 +5903,13 @@ static int llama_decode_internal(
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
+    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+
+
 #ifdef GGML_USE_CUBLAS
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
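
This first hunk moves the output-tensor lookup up to just after graph allocation: llama.cpp names the logits node "result_output" and the final-norm node "result_norm" when it builds the graph, placing them last and second to last in gf->nodes, and the asserts fail fast if that layout ever changes. As a sketch only (not what the commit does), the same tensors could be fetched by name with ggml's ggml_graph_get_tensor() helper, trading the positional assumption for a linear scan over the graph:

    // Sketch: name-based lookup of the graph outputs, an alternative to
    // indexing gf->nodes from the end. "result_output" and "result_norm"
    // are the names llama.cpp assigns while building the graph.
    #include "ggml.h"
    #include <stddef.h>

    static struct ggml_tensor * get_output(struct ggml_cgraph * gf, const char * name) {
        struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
        GGML_ASSERT(t != NULL); // fires if the graph no longer produces this output
        return t;
    }

    // usage:
    //   struct ggml_tensor * res        = get_output(gf, "result_output");
    //   struct ggml_tensor * embeddings = get_output(gf, "result_norm");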
@@ -5920,6 +5927,12 @@ static int llama_decode_internal(
     }
 
     ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
+    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
+    if (!lctx.embedding.empty()) {
+        embeddings->backend = GGML_BACKEND_CPU;
+    }
+    res->backend = GGML_BACKEND_CPU;
 #endif
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
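
This hunk is the fix itself. As the HACK comment says, ggml-alloc may satisfy an output tensor by reusing a parent's buffer, and with cuBLAS enabled that parent can be GPU-resident, so the embedding data was never copied back and the caller could read stale or uninitialized host memory. Forcing GGML_BACKEND_CPU on the outputs after allocation, and before the graph is computed, makes ggml leave the results in host memory. A condensed sketch of the pattern in isolation (the helper name is hypothetical, not from the commit):

    // Hypothetical helper isolating the pattern this hunk applies: after
    // ggml-alloc has run and GPU backends have been assigned, pin the
    // output tensors to the CPU so graph compute writes them to host memory.
    static void pin_outputs_to_cpu(struct ggml_tensor * res,
                                   struct ggml_tensor * embeddings,
                                   bool need_embeddings) {
    #ifdef GGML_USE_CUBLAS
        if (need_embeddings) {
            embeddings->backend = GGML_BACKEND_CPU; // "result_norm"
        }
        res->backend = GGML_BACKEND_CPU; // "result_output"
    #else
        (void) res; (void) embeddings; (void) need_embeddings; // nothing to do off-GPU
    #endif
    }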
@@ -5944,12 +5957,6 @@ static int llama_decode_internal(
         n_threads = 1;
     }
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
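The deletions here are the other half of the move: the same lookup and asserts used to sit just before graph compute, after the cuBLAS section, too late for the new backend override to reference res and embeddings. Through the public API, the visible effect of the commit is that embeddings come out valid on CUDA builds. An end-to-end usage sketch, assuming the llama.h API of roughly this vintage (llama_model_params/llama_context_params split, llama_decode with llama_batch); exact signatures may differ in other versions:

    // Usage sketch (not from the commit): read embeddings on a CUDA build.
    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        llama_backend_init(false /* numa */);

        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        struct llama_context_params cparams = llama_context_default_params();
        cparams.embedding = true; // enable the embedding output path
        struct llama_context * ctx = llama_new_context_with_model(model, cparams);

        // A real program would tokenize a prompt; a lone BOS token keeps this short.
        llama_token tok = llama_token_bos(ctx);
        if (llama_decode(ctx, llama_batch_get_one(&tok, 1, 0, 0)) != 0) {
            fprintf(stderr, "llama_decode failed\n");
            return 1;
        }

        // Before this commit, this buffer could hold garbage on CUDA builds
        // because "result_norm" stayed on the GPU.
        const float * emb = llama_get_embeddings(ctx);
        const int n_embd = llama_n_embd(model);
        printf("n_embd = %d, emb[0] = %f\n", n_embd, emb[0]);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }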