Commit 1b55412

Memory optimizations (ggml-org#44)

* RAM usage reduction and calculations
  - Removed the -b batch limit of 1024 (tested up to -b 8192)
  - Fixed an integer overflow in ggml matmul (happened at around n_batch 3000)
  - Added a dynamic calculation for batched scratch memory consumption
  - Overall reduced RAM buffer sizes by orders of magnitude for normal settings
  - RAM usage scales quadratically with increasing context size
* batch
  - Using a small batch size (or the default of 1) results in a very small memory footprint, even at thousands of tokens processed
  - Tested up to a 13,000-token prompt with an 8k batch; needs more tests on various platforms
* removed debug
* minor
1 parent 8c02206 commit 1b55412

8 files changed: +289 −150 lines
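The matmul overflow named in the commit message is a 32-bit truncation of byte strides. A standalone illustration with hypothetical tensor sizes (roughly an attention-score tensor for a ~3000-token batch on a 128-head model); on typical platforms the int copy silently wraps:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // ne = { 3000, 3000, 128 }, f32 elements: the outermost byte stride
    const size_t nb3 = (size_t)3000 * 3000 * 128 * sizeof(float); // 4,608,000,000

    const int nb3_truncated = (int)nb3; // pre-fix behavior: stride stored in an int

    printf("size_t stride: %zu bytes\n", nb3);          // 4608000000
    printf("int stride:    %d bytes\n", nb3_truncated); // 313032704 -- silently wrong
    return 0;
}

This is exactly the failure mode addressed by the int-to-size_t stride change in ggml.c below.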

examples/falcon/falcon_main.cpp (+22 −20)

@@ -151,6 +151,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+
 #if defined(GGML_USE_CUBLAS)
     // wait for cublas and show device information
     {
@@ -168,14 +169,15 @@
     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
+        falcon_prepare_buffers(ctx, params.n_batch, params.n_ctx);
         {
-            const std::vector<llama_token> tmp(params.n_batch, falcon_token_bos());
-            falcon_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads,params.debug_timings);
+            const std::vector<falcon_token> tmp((int)params.n_batch, falcon_token_bos());
+            falcon_eval(ctx, tmp.data(), (int)tmp.size(), 0, params.n_threads,params.debug_timings);
         }
 
         {
-            const std::vector<llama_token> tmp = { 0, };
-            falcon_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads,params.debug_timings);
+            const std::vector<falcon_token> tmp = { 0, };
+            falcon_eval(ctx, tmp.data(), (int)tmp.size(), params.n_predict - 1, params.n_threads,params.debug_timings);
         }
 
         falcon_print_timings(ctx);
@@ -193,7 +195,7 @@ int main(int argc, char ** argv) {
     }
 
     std::string path_session = params.path_prompt_cache;
-    std::vector<llama_token> session_tokens;
+    std::vector<falcon_token> session_tokens;
 
     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
@@ -219,7 +221,7 @@
     }
 
     // tokenize the prompt
-    std::vector<llama_token> embd_inp;
+    std::vector<falcon_token> embd_inp;
 
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         // Falcon does not have a dedicated bos token (bos==eos), so don't inject it here
@@ -239,11 +241,11 @@
         fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
-
+    falcon_prepare_buffers(ctx, params.n_batch, embd_inp.size()+1);
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
     if (session_tokens.size()) {
-        for (llama_token id : session_tokens) {
+        for (falcon_token id : session_tokens) {
             if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                 break;
             }
@@ -275,8 +277,8 @@
     }
 
     // prefix & suffix for instruct mode
-    std::vector<llama_token> inp_pfx;
-    std::vector<llama_token> inp_sfx;
+    std::vector<falcon_token> inp_pfx;
+    std::vector<falcon_token> inp_sfx;
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -294,7 +296,7 @@
 
     // determine newline token
     //auto llama_token_newline = ::falcon_tokenize(ctx, "\n", false);
-    auto llama_token_newline = std::vector<llama_token>(193);
+    auto llama_token_newline = std::vector<falcon_token>(193);
 
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
@@ -359,7 +361,7 @@ fprintf(stderr, "| | %5d | %.3f | %.3f | %.3f | %5d | %.3f | %.3f | %
     fprintf(stderr, "+============+=======+=======+=======+=======+=======+=======+-------+-------+------+------+--------+---------+\n");
 
     fprintf(stderr, "| %10s | %5s | %5s | %5s | %5s | %13s |\n",
-        "Generation", "Ctx", "Batch", "Keep","Prmpt","Seed");
+        "Generation", "Ctx", "Batch", "Keep","Prom.","Seed");
     fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\n");
     fprintf(stderr, "| | %5d | %5d | %5d | %5zu | %13d |\n",
         n_ctx, params.n_batch, params.n_keep, embd_inp.size(),params.seed);
@@ -372,7 +374,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
     fprintf(stderr, "\n\n");
 
     // TODO: replace with ring-buffer
-    std::vector<llama_token> last_n_tokens(n_ctx);
+    std::vector<falcon_token> last_n_tokens(n_ctx);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
@@ -406,11 +408,11 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
     // the first thing we will do is to output the prompt, so set color accordingly
     console_set_color(con_st, CONSOLE_COLOR_PROMPT);
 
-    std::vector<llama_token> embd;
+    std::vector<falcon_token> embd;
 
     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { falcon_token_bos(), };
+        const std::vector<falcon_token> tmp = { falcon_token_bos(), };
         falcon_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads,0);
         llama_reset_timings(ctx);
     }
@@ -521,7 +523,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-    llama_token id = 0;
+    falcon_token id = 0;
 
     {
         auto logits = falcon_get_logits(ctx);
@@ -532,13 +534,13 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\
             logits[it->first] += it->second;
         }
 
-        std::vector<llama_token_data> candidates;
+        std::vector<falcon_token_data> candidates;
         candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        for (falcon_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(falcon_token_data{token_id, logits[token_id], 0.0f});
        }
 
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        falcon_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
         // Apply penalties
        float nl_logit = logits[falcon_token_nl()];
examples/falcon_common.cpp (+4 −4)

@@ -265,7 +265,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             break;
         }
         params.n_batch = std::stoi(argv[i]);
-        params.n_batch = std::min(1024+128, params.n_batch); // appears to work fine with scratch buffer, keep in eye
+        // params.n_batch = std::min(1024+128, params.n_batch); // appears to work fine with scratch buffer, keep in eye
     } else if (arg == "--keep") {
         if (++i >= argc) {
             invalid_param = true;
@@ -428,7 +428,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             break;
         }
         std::stringstream ss(argv[i]);
-        llama_token key;
+        falcon_token key;
         char sign;
         std::string value_str;
         try {
@@ -602,9 +602,9 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }
 
 // TODO: not great allocating this every time
-std::vector<llama_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos) {
+std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
+    std::vector<falcon_token> res(text.size() + (int) add_bos);
     const int n = falcon_tokenize(ctx, text.c_str(), res.data(), static_cast<int>(res.size()), add_bos);
     assert(n >= 0);
     res.resize(n);
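For reference, a minimal call-site sketch for the retyped tokenizer wrapper; the helper name is hypothetical and a loaded context is assumed:

#include <cstdio>
#include <string>
#include <vector>
#include "falcon_common.h"

// Hypothetical helper: count the tokens of a prompt via the falcon_token wrapper.
// Assumes `ctx` came from the normal falcon model-loading path.
static size_t count_prompt_tokens(struct falcon_context * ctx, const std::string & prompt) {
    // Falcon has no dedicated bos token (bos == eos), so none is injected here
    const std::vector<falcon_token> tokens = falcon_tokenize(ctx, prompt, /*add_bos=*/false);
    fprintf(stderr, "prompt tokenized to %zu tokens\n", tokens.size());
    return tokens.size();
}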

examples/falcon_common.h (+2 −2)

@@ -36,7 +36,7 @@ struct gpt_params {
     // int mb_reserve_gpu_other = false; // override reserved megabytes of VRAM for secondary GPUs
 
     // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    std::unordered_map<falcon_token, float> logit_bias; // logit bias for specific tokens
     int32_t top_k = 40; // <= 0 to use vocab size
     float   top_p = 0.95f; // 1.0 = disabled
     float   tfs_z = 1.00f; // 1.0 = disabled
@@ -93,7 +93,7 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 // Vocab utils
 //
 
-std::vector<llama_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos);
+std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std::string & text, bool add_bos);
 
 //
 // Model utils

ggml-cuda.cu (+9 −9)

@@ -1944,30 +1944,30 @@ void ggml_cuda_update_gpu_status(int device_id) {
 }
 void ggml_cuda_print_gpu_status(const GPUStatus *status, bool print_summary) {
     if (status == NULL) {
-        printf("Error: Invalid GPU status pointer.\n");
+        fprintf(stderr,"Error: Invalid GPU status pointer.\n");
         return;
     }
 
     const char *divider = "+----+------------------------------------+------------+-----------+-----------+-----------+-----------+";
-    printf("%s\n", divider);
-    printf("| ID | %-25s %2d found | %10s | %9s | %9s | %9s | %9s |\n", "Device", status->num_devices, "VRAM Total", "VRAM Free", "VRAM Used","Split at ", "Device");
-    printf("%s\n", divider);
+    fprintf(stderr,"%s\n", divider);
+    fprintf(stderr,"| ID | %-25s %2d found | %10s | %9s | %9s | %9s | %9s |\n", "Device", status->num_devices, "VRAM Total", "VRAM Free", "VRAM Used","Split at ", "Device");
+    fprintf(stderr,"%s\n", divider);
 
     for (int i = 0; i < status->num_devices; ++i) {
         const struct cudaDeviceProp *prop = &status->device_props[i];
         size_t vram_used = status->device_vram_total[i] - status->device_vram_free[i];
         float split_at_percentage = g_tensor_split[i] * 100;
-        printf("| %2d | %-34s | %7zu MB | %6zu MB | %6zu MB | %8.1f%% | %9s |\n",
+        fprintf(stderr,"| %2d | %-34s | %7zu MB | %6zu MB | %6zu MB | %8.1f%% | %9s |\n",
             i,prop->name, status->device_vram_total[i] / (1024 * 1024), status->device_vram_free[i] / (1024 * 1024), vram_used / (1024 * 1024),split_at_percentage, (i == status->main_device_id) ? "Primary" : "Secondary");
         // printf("%s\n", divider);
     }
     if (print_summary && status->num_devices > 1)
     {
-        printf("%s\n", divider);
-        printf("| | %-34s | %7zu MB | %6zu MB | %6zu MB | %9s | %9s |\n",
+        fprintf(stderr,"%s\n", divider);
+        fprintf(stderr,"| | %-34s | %7zu MB | %6zu MB | %6zu MB | %9s | %9s |\n",
             "Device summary", status->total_vram / (1024 * 1024), status->total_free_vram / (1024 * 1024), (status->total_vram - status->total_free_vram) / (1024 * 1024), "N/A", "All");
     }
-    printf("%s\n", divider);
+    fprintf(stderr,"%s\n", divider);
 
 }
 
@@ -2090,7 +2090,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned (CUDA optimized) memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
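The reworded warning sits inside ggml_cuda_host_malloc's fallback path. A simplified sketch of that pattern, not the verbatim function:

#include <cuda_runtime.h>
#include <stdio.h>

// Try pinned (page-locked) host memory first; on failure, clear the sticky
// CUDA error and return NULL so the caller can fall back to pageable memory.
static void * host_malloc_pinned(size_t size) {
    void * ptr = NULL;
    cudaError_t err = cudaMallocHost(&ptr, size);
    if (err != cudaSuccess) {
        cudaGetLastError(); // reset the error state so later CUDA calls are unaffected
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned (CUDA optimized) memory: %s\n",
                size/1024.0/1024.0, cudaGetErrorString(err));
        return NULL;
    }
    return ptr;
}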

ggml.c (+51 −20)

@@ -4539,7 +4539,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+            GGML_PRINT("\n%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
@@ -4552,14 +4552,14 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+            GGML_PRINT("\n%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
                 __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
         if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+            GGML_PRINT("\n%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                 __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
@@ -10921,7 +10921,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-    const int64_t ne10 = src1->ne[0];
+    // const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
@@ -10931,20 +10931,20 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne2 = dst->ne[2];
     const int64_t ne3 = dst->ne[3];
 
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
 
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1]; UNUSED(nb11);
-    const int nb12 = src1->nb[2]; UNUSED(nb12);
-    const int nb13 = src1->nb[3]; UNUSED(nb13);
+    const size_t nb10 = src1->nb[0];
+    const size_t nb11 = src1->nb[1]; UNUSED(nb11);
+    const size_t nb12 = src1->nb[2]; UNUSED(nb12);
+    const size_t nb13 = src1->nb[3]; UNUSED(nb13);
 
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -10962,7 +10962,17 @@ static void ggml_compute_forward_mul_mat_f32(
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb0 <= nb1);
     GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
+    if (!(nb2 <= nb3) )
+    {
+        fprintf(stderr,"nb2=%zu nb3=%zu Assert condition is %s\n",nb2,nb3,nb2 <= nb3 ? "true" : "false");
+        ggml_tensor_printf(dst,"",0,true,false);
+
+    }
+    GGML_ASSERT(nb2 <= nb3);
+
+
+
+
 
     //GGML_ASSERT(ne0 == ne01);
     //GGML_ASSERT(ne1 == ne11);
@@ -12840,7 +12850,7 @@ static void ggml_compute_forward_rope_f32(
     const size_t nb2 = dst->nb[2];
     const size_t nb3 = dst->nb[3];
 
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
+    // printf("%s[%d] ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", dst->name,dst->meta.layer_id,ne0, ne1, ne2, ne3);
     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
 
     GGML_ASSERT(nb00 == sizeof(float));
@@ -19525,7 +19535,7 @@ void ggml_printTensorSample(char *prefix,const struct ggml_tensor * tensor) {
     printf("%s", sep);
     printf("| Content of %s \"%s\" (%d dim)",prefix,tensor->name,tensor->n_dims);
     printf("\n");
-    const int max_elements = 4;
+    const int max_elements = 40000;
 
     if (tensor->n_dims == 1) {
         printf("| ");
@@ -19595,7 +19605,7 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     */
     {
         pos = 0;
-        for (int i = 0; i < tensor->n_dims; i++) {
+        for (int i = 0; i <= tensor->n_dims; i++) {
             pos += snprintf(strides + pos, sizeof(strides) - pos, "%" PRId64, tensor->nb[i]);
             if (i != tensor->n_dims - 1) {
                 pos += snprintf(strides + pos, sizeof(strides) - pos, "x");
@@ -19683,7 +19693,28 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     }
     printf("%s\n", sep_border);
 }
+float ggml_get_tensor_index(const struct ggml_tensor* tensor, int ind1, int ind2, int ind3, int ind4) {
+    if (tensor->n_dims < 1 || tensor->n_dims > 4) {
+        printf("Error: Incorrect dimension number %d\n", tensor->n_dims);
+        return -1; // handle error
+    }
+
+    int indices[4] = {ind1, ind2, ind3, ind4};
+    int total_offset = 0;
+
+    for (int i = 0; i < tensor->n_dims; i++) {
+        if (indices[i] > tensor->ne[i] || indices[i] < 0) {
+            printf("Error: Incorrect index for dimension %d\n", i);
+            printf("Index: %d, Dimension size: %ld\n", indices[i], tensor->ne[i]);
+            return -1; // handle error
+        }
+
+        total_offset += indices[i] * tensor->nb[i];
+    }
 
+    // Return the value at the calculated offset
+    return *(float *)((char *) tensor->data + total_offset);
+}
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {
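A small usage sketch for the new debugging helper; the toy tensor and fill value are hypothetical, assuming the usual ggml context API:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // small scratch context for a toy tensor
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    // 4 columns x 3 rows of f32, filled with a known value
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    ggml_set_f32(t, 1.5f);

    // ind1..ind4 are per-dimension element indices; nb[] byte strides are applied internally
    printf("t[2,1] = %f\n", ggml_get_tensor_index(t, 2, 1, 0, 0));

    ggml_free(ctx);
    return 0;
}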

ggml.h (+2)

@@ -1345,6 +1345,8 @@ extern "C" {
 
     // visualize the tensor - extended adds more information - when printing sample content extended will also print src0 and src1 content
     void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line, bool extended, bool print_sample);
+    // helper to accessa specific single index value (tested for fp32 only, though nb[] is considered)
+    float ggml_get_tensor_index(const struct ggml_tensor* tensor, int ind1, int ind2, int ind3, int ind4);
 
     //
     // optimization
