Skip to content

Commit

Permalink
Custom RoPE + bettter memory management for CUDA (#2295)
Browse files Browse the repository at this point in the history
* Custom RoPE + bettter memory management for CUDA

* Adjusted look ahead in ggml_cuda_pool_malloc to 5%

This is sufficient it seems.
We end up using about 200 MB less VRAM that way when running
the 13B model with context 8192.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
  • Loading branch information
ikawrakow and Kawrakow authored Jul 21, 2023
1 parent 4d76a5f commit d924522
Showing 1 changed file with 49 additions and 11 deletions.
60 changes: 49 additions & 11 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2423,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));

#ifdef DEBUG_CUDA_MALLOC
int nnz = 0;
size_t max_size = 0, tot_size = 0;
#endif
size_t best_diff = 1ull << 36;
int ibest = -1;
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.size >= size && b.ptr != nullptr) {
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
if (b.ptr != nullptr) {
#ifdef DEBUG_CUDA_MALLOC
++nnz;
tot_size += b.size;
if (b.size > max_size) max_size = b.size;
#endif
if (b.size >= size) {
size_t diff = b.size - size;
if (diff < best_diff) {
best_diff = diff;
ibest = i;
if (!best_diff) {
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
}
}
}
}
if (ibest >= 0) {
cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
#ifdef DEBUG_CUDA_MALLOC
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
#endif
void * ptr;
CUDA_CHECK(cudaMalloc((void **) &ptr, size));
*actual_size = size;
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
*actual_size = look_ahead_size;
return ptr;
}

Expand Down Expand Up @@ -2955,8 +2988,13 @@ inline void ggml_cuda_op_rope(
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];

const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
// RoPE alteration for extended context
float freq_base, freq_scale;
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;

bool is_glm = mode & 4;

Expand Down

0 comments on commit d924522

Please sign in to comment.