Vulkan k-quant mmq and ggml-backend offload functionality (ggerganov#6155)

* Fix Vulkan incoherent output when KV cache offload is disabled

* Add k-quant mul mat mat (matrix-matrix multiplication) shaders (see the block-layout sketch after this list)

* Rework working buffer allocation; reduces VRAM use noticeably

  Clean up CPU-assist code; replaced with the ggml-backend offload function

* Default to using all dedicated GPUs

* Add a fallback to integrated GPUs when no dedicated GPU is found (see the device-selection sketch after this list)

* Add debug info showing which device is allocating memory

* Fix Intel dequant issue

Fix validation issue

* Fix Vulkan GGML_OP_GET_ROWS implementation

* Clean up merge artifacts

* Remove Vulkan warning
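
For context on the new k-quant shaders: each k-quant format packs weights into 256-element superblocks with two levels of scales. As a sketch, here is the `block_q4_K` layout such a shader has to dequantize, written as plain C from memory of ggml's k-quants rather than taken from this commit:

```c
#include <stdint.h>

// Sketch (assumed layout): one Q4_K superblock covers QK_K = 256 weights.
#define QK_K         256
#define K_SCALE_SIZE 12

typedef uint16_t ggml_half; // fp16 stored as raw bits

typedef struct {
    ggml_half d;                  // superblock scale applied to the 6-bit sub-scales
    ggml_half dmin;               // superblock scale applied to the 6-bit sub-mins
    uint8_t scales[K_SCALE_SIZE]; // 8 sub-blocks x (6-bit scale, 6-bit min), bit-packed
    uint8_t qs[QK_K / 2];         // 256 4-bit quants, two per byte
} block_q4_K;                     // 144 bytes per 256 weights = 4.5 bits/weight
```

A mul mat shader for this format has to unpack the bit-packed sub-scales before it can touch the 4-bit quants, which is why the k-quant types need dedicated shaders instead of reusing the simpler Q4_0/Q4_1 paths.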
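The "default to dedicated GPUs, fall back to integrated" policy can be pictured with standard Vulkan device enumeration. The sketch below uses only core Vulkan calls and illustrates the policy; it is not the commit's actual selection code:

```c
#include <vulkan/vulkan.h>

// Sketch: prefer discrete GPUs; use integrated GPUs only if no discrete one exists.
static uint32_t pick_devices(VkInstance instance, VkPhysicalDevice * out, uint32_t cap) {
    VkPhysicalDevice devices[16]; // mirrors GGML_VK_MAX_DEVICES
    uint32_t count = 16;
    vkEnumeratePhysicalDevices(instance, &count, devices); // may return VK_INCOMPLETE; fine for a sketch

    uint32_t n = 0;
    // First pass: dedicated (discrete) GPUs only.
    for (uint32_t i = 0; i < count && n < cap; i++) {
        VkPhysicalDeviceProperties props;
        vkGetPhysicalDeviceProperties(devices[i], &props);
        if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) {
            out[n++] = devices[i];
        }
    }
    // Fallback: no discrete GPU found, so accept integrated GPUs.
    if (n == 0) {
        for (uint32_t i = 0; i < count && n < cap; i++) {
            VkPhysicalDeviceProperties props;
            vkGetPhysicalDeviceProperties(devices[i], &props);
            if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) {
                out[n++] = devices[i];
            }
        }
    }
    return n;
}
```

Taking discrete devices first keeps weights in dedicated VRAM rather than memory shared with the CPU, while the second pass keeps the backend usable on iGPU-only machines.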
0cc4m authored and tybalex committed Apr 17, 2024
1 parent 1d1feab commit 20b2bfb
Showing 7 changed files with 37,776 additions and 15,352 deletions.
9 changes: 0 additions & 9 deletions README.md
```diff
@@ -636,15 +636,6 @@ Building the program with BLAS support may lead to some performance improvements
 - #### Vulkan
 
-  > [!WARNING]
-  >
-  > Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-  > due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-  > but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-  > (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
-  >
-  > Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
   **With docker**:
 
   You don't need to install Vulkan SDK. It will be installed inside the container.
```
52,138 changes: 37,199 additions & 14,939 deletions ggml-vulkan-shaders.hpp

Large diffs are not rendered by default.

635 changes: 328 additions & 307 deletions ggml-vulkan.cpp

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions ggml-vulkan.h
```diff
@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
```
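
With the cpu-assist entry points gone, the header is reduced to instance init plus the ggml-backend interface. Below is a minimal sketch of driving the Vulkan backend through that interface; it assumes this era's `ggml_backend_alloc_ctx_tensors` and `ggml_backend_graph_compute` helpers, leaves tensor contents uninitialized, and omits error handling:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main(void) {
    // One call replaces the whole ggml_vk_*_cpu_assist lifecycle.
    ggml_backend_t backend = ggml_backend_vk_init(0); // device 0

    // Build a tiny graph: y = a * b (no_alloc = true, the backend owns the memory).
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ true };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * y = ggml_mul_mat(ctx, a, b);

    // Place every tensor in Vulkan device memory and run the graph there.
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_backend_graph_compute(backend, gf);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}
```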
35 changes: 0 additions & 35 deletions ggml.c
```diff
@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
 #if defined(GGML_USE_CLBLAST)
     ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-    ggml_vk_init_cpu_assist();
 #endif
 
     ggml_setup_op_has_task_pass();
@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         return;
     }
 
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
```
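
The deleted blocks were the "cpu assist" path: `ggml_graph_compute()` itself preallocated Vulkan buffers, offloaded whichever nodes it could, and cleaned up afterwards. With ggml-backend offload, the caller does this explicitly via the scheduler instead. A rough sketch follows; the `ggml_backend_sched_new` signature is stated from memory of this era's `ggml-backend.h`, so treat it as an assumption:

```c
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-vulkan.h"

// Sketch: split one graph across Vulkan and CPU, replacing the old
// GGML_USE_VULKAN hooks inside ggml_graph_compute().
void compute_with_offload(struct ggml_cgraph * graph, size_t graph_size) {
    ggml_backend_t backends[2] = {
        ggml_backend_vk_init(0),  // listed first: preferred for ops it supports
        ggml_backend_cpu_init(),  // listed last: fallback for everything else
    };

    // NULL buffer types = defaults; graph_size bounds the scheduler's node count;
    // false = no parallel (async) execution across backends.
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, graph_size, false);

    // The scheduler assigns nodes to backends, allocates the working buffers,
    // and inserts the inter-backend copies that the cpu-assist code did by hand.
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
    ggml_backend_free(backends[0]);
    ggml_backend_free(backends[1]);
}
```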