Skip to content

Commit

Permalink
examples : use scratch buffers to reduce memory usage (#176)
Browse files: browse the repository at this point in the history
* starcoder : example for using scratch buffers to reduce memory usage

* starcoder : bump scratch buffers to 256 MB

* examples : add scratch buffers to MPT and GPT-NeoX
  • Loading branch information
ggerganov authored May 20, 2023
1 parent efb37a3 commit d695755
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 4 deletions.
16 changes: 16 additions & 0 deletions examples/gpt-neox/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,14 @@ bool gpt_neox_eval(
static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);

// use 2 scratch buffers
// TODO: very hacky solution - reimplement in a more elegant way
static size_t scr0_size = 256u*1024*1024;
static void * scr0 = malloc(scr0_size);

static size_t scr1_size = 256u*1024*1024;
static void * scr1 = malloc(scr1_size);

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
Expand Down Expand Up @@ -477,6 +485,8 @@ bool gpt_neox_eval(
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;

ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

// self-attention
{
{
Expand Down Expand Up @@ -580,6 +590,8 @@ bool gpt_neox_eval(
}
}

ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });

if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);

Expand All @@ -602,6 +614,8 @@ bool gpt_neox_eval(
}
}

ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

// norm
{
inpL = ggml_norm(ctx0, inpL);
Expand All @@ -614,6 +628,8 @@ bool gpt_neox_eval(
ggml_repeat(ctx0, model.ln_f_b, inpL));
}

ggml_set_scratch(ctx0, { 0, 0, nullptr, });

// lm_head
{
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
Expand Down
17 changes: 16 additions & 1 deletion examples/mpt/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,14 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
static size_t buf_size = 256u * 1024 * 1024;
static void * buf = malloc(buf_size);

// use 2 scratch buffers
// TODO: very hacky solution - reimplement in a more elegant way
static size_t scr0_size = 256u*1024*1024;
static void * scr0 = malloc(scr0_size);

static size_t scr1_size = 256u*1024*1024;
static void * scr1 = malloc(scr1_size);

if (mem_per_token > 0 && mem_per_token * N > buf_size) {
const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
Expand Down Expand Up @@ -380,6 +388,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,

struct ggml_tensor * cur;

ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

// a = self.ln_1(x)
{
cur = ggml_norm(ctx0, inpL);
Expand All @@ -392,7 +402,6 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
// attn_bias=attn_bias, attention_mask=attention_mask,
// is_causal=is_causal)
{

// compute QKV
cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur);

Expand Down Expand Up @@ -475,6 +484,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,

inpL = ggml_add(ctx0, inpL, cur);

ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });

// m = self.ln_2(x)
{
cur = ggml_norm(ctx0, inpL);
Expand All @@ -499,13 +510,17 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
inpL = ggml_add(ctx0, inpL, cur);
}

ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

// norm
{
inpL = ggml_norm(ctx0, inpL);
// inpL = ln_f_g*inpL
inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
}

ggml_set_scratch(ctx0, { 0, 0, nullptr, });

// output embedding weight tied to input embedding
inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);

Expand Down
20 changes: 18 additions & 2 deletions examples/starcoder/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,14 @@ bool starcoder_eval(
static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);

// use 2 scratch buffers
// TODO: very hacky solution - reimplement in a more elegant way
static size_t scr0_size = 256u*1024*1024;
static void * scr0 = malloc(scr0_size);

static size_t scr1_size = 256u*1024*1024;
static void * scr1 = malloc(scr1_size);

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
Expand Down Expand Up @@ -456,6 +464,8 @@ bool starcoder_eval(
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;

ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

// norm
{
// [ 768, N]
Expand Down Expand Up @@ -519,7 +529,7 @@ bool starcoder_eval(
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3); //TODO: need to be tiled
0, 2, 1, 3); //TODO: need to be tiled

// GG: flash attention
//struct ggml_tensor * V =
Expand Down Expand Up @@ -602,6 +612,8 @@ bool starcoder_eval(

struct ggml_tensor * inpFF = cur;

ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });

// feed-forward network
{
// norm
Expand Down Expand Up @@ -658,6 +670,8 @@ bool starcoder_eval(
inpL = ggml_add(ctx0, cur, inpFF);
}

ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });

// norm
{
// [ 768, N]
Expand All @@ -672,6 +686,8 @@ bool starcoder_eval(
ggml_repeat(ctx0, model.ln_f_b, inpL));
}

ggml_set_scratch(ctx0, { 0, 0, nullptr, });

// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
Expand Down Expand Up @@ -699,7 +715,7 @@ bool starcoder_eval(
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
//printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));

ggml_free(ctx0);

Expand Down
3 changes: 2 additions & 1 deletion src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -4077,7 +4077,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}
Expand Down

0 comments on commit d695755

Please sign in to comment.