@@ -6,13 +6,6 @@
 #include "rwkv_v3.h"
 #include "ggml.h"
 
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-#if defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
-#endif
-
 #include <string>
 #include <vector>
 #include <cstring>
@@ -1065,11 +1058,7 @@ struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
     const size_t n_threads,
     const size_t sequence_len = 1
 ) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
-    enum ggml_type mul_mat_type = type == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16;
-#else
     enum ggml_type mul_mat_type = ggml_is_quantized(type) ? GGML_TYPE_Q8_1 : type;
-#endif
     return ctx.alloc(GGML_TYPE_I8, rwkv_future_tensor::size(mul_mat_type, ffn_key_height, sequence_len) * n_threads + 64 * (n_threads - 1));
 }
 
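Note: the sketch below is an annotation, not part of the patch. With the GPU branch removed, quantized weights always go through a Q8_1 intermediate for matrix multiplication, and the work buffer is sized as one quantized slice per thread plus 64-byte alignment padding between per-thread regions. A rough standalone sketch of that arithmetic, assuming ggml's Q8_1 layout of this era (32 values per block, each block holding 32 int8 quants plus two fp16 scalars; check ggml.c for the authoritative sizes):

#include <cstddef>
#include <cstdio>

// Assumed Q8_1 constants; verify against ggml.c.
constexpr size_t QK8_1         = 32; // values per quantization block
constexpr size_t Q8_1_BLOCK_SZ = 36; // 32 int8 quants + 2 fp16 scalars

// Mirrors the estimate above: each thread quantizes a slice of
// ffn_key_height x sequence_len values into Q8_1, and consecutive
// per-thread regions are padded out to a 64-byte boundary.
size_t work_bytes(size_t ffn_key_height, size_t sequence_len, size_t n_threads) {
    const size_t slice = (ffn_key_height / QK8_1) * Q8_1_BLOCK_SZ * sequence_len;
    return slice * n_threads + 64 * (n_threads - 1);
}

int main() {
    // e.g. a 4096-wide FFN key matrix, one token, 8 threads
    std::printf("%zu bytes\n", work_bytes(4096, 1, 8));
    return 0;
}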
|
@@ -1556,38 +1545,7 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
 }
 
 bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
-    printf("\nRWKV: Attempting offload of %u layers",n_layers);
-    const auto offload = [&](struct ggml_tensor * tensor) {
-        // TODO support multi-GPU
-        tensor->backend = GGML_BACKEND_GPU;
-        #if defined(GGML_USE_CLBLAST)
-        ggml_cl_transform_tensor(tensor->data, tensor);
-        #else
-        ggml_cuda_transform_tensor(tensor->data, tensor);
-        #endif
-    };
-
-    const size_t n_gpu = std::min(n_layers, ctx->instance->model.header.n_layer);
-
-    if (ctx->gpu_layers < n_gpu) {
-        for (size_t & i = ctx->gpu_layers; i < n_gpu; i++) {
-            const struct rwkv_layer & layer = ctx->instance->model.layers[i];
-
-            // TODO also offload other operations to GPU with ggml_cuda_assign_buffers
-            offload(layer.att_key);
-            offload(layer.att_value);
-            offload(layer.att_receptance);
-            offload(layer.att_output);
-
-            offload(layer.ffn_key);
-            offload(layer.ffn_value);
-            offload(layer.ffn_receptance);
-        }
 
-        return true;
-    }
-#endif
     return false;
 }
 
|
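Note: the sketch below is an annotation, not part of the patch. After this change rwkv_gpu_offload_layers is a stub that always reports failure, so callers must treat offload as best-effort and keep a CPU path. A hedged usage sketch against the public rwkv.cpp API (the init/free signatures are assumed from rwkv.h; the model path and layer count are placeholders):

#include <cstdio>
#include "rwkv.h" // adjust to rwkv_v3.h for this fork

int main() {
    // Assumed signature: rwkv_init_from_file(file_path, n_threads).
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 8);
    if (ctx == NULL) {
        return 1;
    }

    // After this patch the call is a no-op returning false, so the
    // fallback branch below is always taken.
    if (!rwkv_gpu_offload_layers(ctx, 24)) {
        std::printf("GPU offload unavailable; staying on CPU\n");
    }

    rwkv_free(ctx);
    return 0;
}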