Commit 03a4e98

Revert the prompt processing on gpu for now.
Fixes issues ggml-org#1580 and ggml-org#1581
1 parent f038196 commit 03a4e98

File tree

1 file changed: +11, -3 lines changed

llama.cpp (+11, -3)
@@ -3603,7 +3603,7 @@ static struct ggml_cgraph * llm_build_falcon(
     ggml_build_forward_expand(gf, cur);
 
     ggml_free(ctx0);
-
+
 #if defined(GGML_USE_KOMPUTE)
     if (lctx.ctx_kompute) {
         if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
@@ -4147,7 +4147,7 @@ static struct ggml_cgraph * llm_build_mpt(
     ggml_build_forward_expand(gf, cur);
 
     ggml_free(ctx0);
-
+
 #if defined(GGML_USE_KOMPUTE)
     if (lctx.ctx_kompute) {
         if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
@@ -4307,11 +4307,19 @@ static bool llama_eval_internal(
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute) {
+    if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, gf);
         ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
     } else {
+        if (lctx.ctx_kompute) {
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
+        }
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        if (lctx.ctx_kompute) {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
+        }
     }
 #else
     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
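
What the last hunk amounts to: with this revert, the Kompute path only runs the graph on the GPU for single-token decode (N == 1). Prompt batches (N > 1) fall back to the CPU graph, and because the KV cache is device-resident, kv_self.k and kv_self.v are copied device-to-host before the CPU compute and host-to-device afterwards, so the CPU graph sees the current cache and the device copy picks up the new entries. Below is a minimal standalone sketch of that dispatch; the vk_context type and the run_graph_on_gpu / run_graph_on_cpu / copy_kv_to_host / copy_kv_to_device helpers are hypothetical stand-ins for illustration only, not llama.cpp or Kompute API.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for "lctx.ctx_kompute is non-NULL". */
typedef struct { bool available; } vk_context;

/* Stand-in helpers; in the real code these roughly correspond to
 * ggml_vk_graph_compute, ggml_graph_compute_helper, and the
 * ggml_vk_d2h_tensor / ggml_vk_h2d_tensor calls on kv_self.k / kv_self.v. */
static void run_graph_on_gpu(void)  { puts("graph computed on GPU"); }
static void run_graph_on_cpu(void)  { puts("graph computed on CPU"); }
static void copy_kv_to_host(void)   { puts("KV cache copied device -> host"); }
static void copy_kv_to_device(void) { puts("KV cache copied host -> device"); }

/* N is the number of tokens in the batch being evaluated. */
static void eval_batch(const vk_context *vk, int N) {
    if (vk->available && N == 1) {
        /* Single-token decode stays on the GPU. */
        run_graph_on_gpu();
    } else {
        /* Prompt processing is reverted to the CPU for now; sync the
         * device-resident KV cache around the CPU compute. */
        if (vk->available) copy_kv_to_host();
        run_graph_on_cpu();
        if (vk->available) copy_kv_to_device();
    }
}

int main(void) {
    vk_context vk = { .available = true };
    eval_batch(&vk, 1);   /* decode path  -> GPU */
    eval_batch(&vk, 32);  /* prompt batch -> CPU with KV sync */
    return 0;
}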
