Commit ef3b8dc

GPU accel for rwkv is slow, disable it
1 parent: e1a7042

File tree

2 files changed: +4 −46 lines


gpttype_adapter.cpp (+4 −4)

@@ -432,10 +432,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
             rwkv_ctx_v3 = rwkv_init_from_file(modelname.c_str(), n_threads);
 
-            if(inputs.gpulayers>0)
-            {
-                rwkv_gpu_offload_layers(rwkv_ctx_v3,inputs.gpulayers);
-            }
+            // if(inputs.gpulayers>0)
+            // {
+            //     rwkv_gpu_offload_layers(rwkv_ctx_v3,inputs.gpulayers);
+            // }
 
             const struct rwkv_file_header & header = rwkv_ctx_v3->instance->model.header;
             const size_t n_vocab = header.n_vocab;
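
The net effect is that inputs.gpulayers is now silently ignored on the RWKV v3 load path. A minimal sketch, not part of the commit, of how the loader could surface that instead (names taken from the diff above):

    // Hypothetical replacement for the commented-out block: report that the
    // requested offload is disabled rather than dropping it silently.
    if (inputs.gpulayers > 0)
    {
        printf("\nRWKV: GPU offload is disabled in this build, running on CPU instead");
    }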

otherarch/rwkv_v3.cpp (−42)

@@ -6,13 +6,6 @@
 #include "rwkv_v3.h"
 #include "ggml.h"
 
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-#if defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
-#endif
-
 #include <string>
 #include <vector>
 #include <cstring>
@@ -1065,11 +1058,7 @@ struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
     const size_t n_threads,
     const size_t sequence_len = 1
 ) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
-    enum ggml_type mul_mat_type = type == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16;
-#else
     enum ggml_type mul_mat_type = ggml_is_quantized(type) ? GGML_TYPE_Q8_1 : type;
-#endif
     return ctx.alloc(GGML_TYPE_I8, rwkv_future_tensor::size(mul_mat_type, ffn_key_height, sequence_len) * n_threads + 64 * (n_threads - 1));
 }
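
After this change the work buffer is always sized for the CPU matmul path: quantized weights get a GGML_TYPE_Q8_1 scratch area, all other types use their own type. A rough sizing sketch, assuming the names from the hunk above:

    // Per-thread scratch for one FFN key matmul, plus 64 bytes of alignment
    // padding between consecutive threads (mirrors the return expression).
    const size_t per_thread = rwkv_future_tensor::size(GGML_TYPE_Q8_1, ffn_key_height, sequence_len);
    const size_t work_bytes = per_thread * n_threads + 64 * (n_threads - 1);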

@@ -1556,38 +1545,7 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
 }
 
 bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
-    printf("\nRWKV: Attempting offload of %u layers",n_layers);
-    const auto offload = [&](struct ggml_tensor * tensor) {
-        // TODO support multi-GPU
-        tensor->backend = GGML_BACKEND_GPU;
-#if defined(GGML_USE_CLBLAST)
-        ggml_cl_transform_tensor(tensor->data, tensor);
-#else
-        ggml_cuda_transform_tensor(tensor->data, tensor);
-#endif
-    };
-
-    const size_t n_gpu = std::min(n_layers, ctx->instance->model.header.n_layer);
-
-    if (ctx->gpu_layers < n_gpu) {
-        for (size_t & i = ctx->gpu_layers; i < n_gpu; i++) {
-            const struct rwkv_layer & layer = ctx->instance->model.layers[i];
-
-            // TODO also offload other operations to GPU with ggml_cuda_assign_buffers
-            offload(layer.att_key);
-            offload(layer.att_value);
-            offload(layer.att_receptance);
-            offload(layer.att_output);
-
-            offload(layer.ffn_key);
-            offload(layer.ffn_value);
-            offload(layer.ffn_receptance);
-        }
 
-        return true;
-    }
-#endif
     return false;
 }
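
With its body removed, rwkv_gpu_offload_layers now returns false unconditionally, matching the commented-out call site in gpttype_adapter.cpp above. A minimal caller-side sketch (the file name and layer count below are made-up values):

    // Hypothetical caller: with this commit applied the call is a no-op
    // and always reports failure, so inference stays on the CPU.
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 8);
    if (!rwkv_gpu_offload_layers(ctx, 24)) {
        // nothing was moved to the GPU
    }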
