ggml : sync all changes from llama.cpp and whisper.cpp
ggerganov committed Mar 29, 2023
1 parent 3adf02e commit 503722c
Showing 12 changed files with 2,534 additions and 2,024 deletions.
42 changes: 22 additions & 20 deletions examples/gpt-2/main.cpp
@@ -40,7 +40,7 @@ struct gpt2_layer {
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;

-struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};

@@ -231,23 +231,23 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];

-layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
-layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
+layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
+layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

-layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
-layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
+layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
-layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
@@ -265,7 +265,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;

model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
}
}
@@ -537,11 +537,13 @@ bool gpt2_eval(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
-ggml_permute(ctx0,
-        ggml_reshape_3d(ctx0,
-            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            n_embd/n_head, n_head, n_past + N),
-        1, 2, 0, 3);
+ggml_cpy(ctx0,
+        ggml_permute(ctx0,
+            ggml_reshape_3d(ctx0,
+                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                n_embd/n_head, n_head, n_past + N),
+            1, 2, 0, 3),
+        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
@@ -625,7 +627,7 @@ bool gpt2_eval(
// cur = proj_w*cur + proj_b
// [768, N]
cur = ggml_mul_mat(ctx0,
-model.layers[il].c_mlp_proj_w_trans,
+model.layers[il].c_mlp_proj_w,
cur);

cur = ggml_add(ctx0,
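The functional change in gpt2_eval (mirrored in the GPT-J example below) is twofold: the permuted view of the cached V values is now copied into a fresh contiguous tensor before the attention matmul, and the MLP projection weight is used under its plain name instead of a pre-transposed one. A minimal sketch of the new pattern, assuming an already-initialized ggml_context * ctx0, the same variables as in the example above (model, il, cur, n_embd, n_head, n_past, N), and an F32 view v_mem of the cached V values:

    // sketch only -- illustrates the post-commit pattern, not code taken from this diff
    // 1) build a non-contiguous permuted view of the cached values
    struct ggml_tensor * v_view = ggml_permute(ctx0,
            ggml_reshape_3d(ctx0, v_mem, n_embd/n_head, n_head, n_past + N),
            1, 2, 0, 3);

    // 2) materialize it with ggml_cpy into a contiguous tensor before the matmul
    struct ggml_tensor * V_trans = ggml_cpy(ctx0, v_view,
            ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

    // 3) the MLP projection weight is multiplied directly, without a *_trans variant
    cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur);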
7 changes: 2 additions & 5 deletions examples/gpt-2/quantize.cpp
@@ -12,9 +12,6 @@
#include <vector>
#include <regex>

-// TODO: move somewhere else
-#define QK 32

// default hparams (GPT-2 117M)
struct gpt2_hparams {
int32_t n_vocab = 50257;
@@ -223,11 +220,11 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
switch (type) {
case GGML_TYPE_Q4_0:
{
-cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
-cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
default:
{
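Both quantize examples change the same way: the local QK define and the block-size argument disappear, because the synced ggml fixes the quantization block size internally. A hedged usage sketch of the post-sync call (the wrapper function and buffer sizing are illustrative, not taken from the commit; the 16-entry histogram matches the 4-bit value range of the removed helpers further below):

    #include "ggml.h"
    #include <cstdint>
    #include <vector>

    // quantize `nelements` floats laid out in rows of length `ne0`
    size_t quantize_rows_q4_0(std::vector<float> & data_f32, int nelements, int ne0) {
        std::vector<char>    work(data_f32.size()*sizeof(float)); // conservative output buffer
        std::vector<int64_t> hist_cur(16, 0);                     // one bucket per 4-bit value
        // a real caller (like quantize.cpp above) keeps `work` and writes the
        // returned number of bytes to the output file
        return ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne0, hist_cur.data());
    }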
40 changes: 21 additions & 19 deletions examples/gpt-j/main.cpp
@@ -38,7 +38,7 @@ struct gptj_layer {
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;

-struct ggml_tensor * c_mlp_proj_w_trans;
+struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};

@@ -180,7 +180,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w_trans
+ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
@@ -236,20 +236,20 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];

-layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

-layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

-layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
-layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
+layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
-layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

// map by name
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
@@ -264,7 +264,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b;

model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w_trans;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b;
}
}
@@ -510,11 +510,13 @@ bool gptj_eval(

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V_trans =
-ggml_permute(ctx0,
-        ggml_reshape_3d(ctx0,
-            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            n_embd/n_head, n_head, n_past + N),
-        1, 2, 0, 3);
+ggml_cpy(ctx0,
+        ggml_permute(ctx0,
+            ggml_reshape_3d(ctx0,
+                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                n_embd/n_head, n_head, n_past + N),
+            1, 2, 0, 3),
+        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

// KQV = transpose(V) * KQ_soft_max
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@@ -553,7 +555,7 @@ bool gptj_eval(
// projection
// cur = proj_w*cur + proj_b
cur = ggml_mul_mat(ctx0,
-model.layers[il].c_mlp_proj_w_trans,
+model.layers[il].c_mlp_proj_w,
cur);

cur = ggml_add(ctx0,
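The rename is safe for existing model files because only the C-side field changes; the on-disk tensor name "transformer.h.<i>.mlp.fc_out.weight" (and "model/h<i>/mlp/c_proj/w" for GPT-2) is untouched, and the loader resolves tensors through the name map built above. A hedged sketch of how that map is typically consumed while reading the weights (the stream handling is illustrative, not code from this commit):

    // assumed in scope, inside the loader's read loop: std::ifstream fin opened on the
    // model file, and the model.tensors map built in gptj_model_load above
    std::string name = "transformer.h.0.mlp.fc_out.weight"; // now resolves to c_mlp_proj_w

    auto it = model.tensors.find(name);
    if (it == model.tensors.end()) {
        fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
        return false;
    }

    struct ggml_tensor * tensor = it->second;
    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));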
7 changes: 2 additions & 5 deletions examples/gpt-j/quantize.cpp
@@ -12,9 +12,6 @@
#include <vector>
#include <regex>

-// TODO: move somewhere else
-#define QK 32

// default hparams (GPT-J 6B)
struct gptj_hparams {
int32_t n_vocab = 50400;
@@ -225,11 +222,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
switch (type) {
case GGML_TYPE_Q4_0:
{
-cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
-cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
+cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
default:
{
110 changes: 0 additions & 110 deletions examples/utils.cpp
@@ -328,113 +328,3 @@ gpt_vocab::id gpt_sample_top_k_top_p(

return logits_id[idx].second;
}

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t row_size = nb*(sizeof(float) + sizeof(uint8_t)*qk/2);

assert(k % qk == 0);

uint8_t pp[qk/2];

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
float * pd = (float *) (pdst + (j/k)*row_size);
uint8_t * pb = (uint8_t *) (pd + nb);

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
amax = std::max(amax, fabsf(v));
}

const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;

pd[i] = d;

for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0])*id;
const float v1 = (src[j + i*qk + l + 1])*id;

const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;

assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);

hist[vi0]++;
hist[vi1]++;

pp[l/2] = vi0 | (vi1 << 4);
}

memcpy(pb + i*qk/2, pp, sizeof(pp));
}
}
}

return (n/k)*row_size;
}

size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);

assert(k % qk == 0);

uint8_t pp[qk/2];

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
float * pm = (float *) (pdst + (j/k)*row_size);
float * pd = (float *) (pm + nb);
uint8_t * pb = (uint8_t *) (pd + nb);

//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);

for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::min();

{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
if (v < min) min = v;
if (v > max) max = v;
}

const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;

pm[i] = min;
pd[i] = d;

for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0] - min)*id;
const float v1 = (src[j + i*qk + l + 1] - min)*id;

const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);

assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);

hist[vi0]++;
hist[vi1]++;

pp[l/2] = vi0 | (vi1 << 4);
}

memcpy(pb + i*qk/2, pp, sizeof(pp));
}
}
}

return (n/k)*row_size;
}
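These two helpers now live in ggml itself with the block size fixed at 32, which is why the qk argument also disappears from the quantize examples above. Their size arithmetic is easy to reconstruct from the removed code: a Q4_0 block packs 32 weights into sizeof(float) + 32/2 = 4 + 16 = 20 bytes (one scale plus 32 nibbles, i.e. 5 bits per weight), while a Q4_1 block adds a second float for the per-block minimum, 2*sizeof(float) + 16 = 24 bytes (6 bits per weight); a row of k weights therefore packs into (k/32)*20 or (k/32)*24 bytes respectively.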
9 changes: 1 addition & 8 deletions examples/utils.h
@@ -20,7 +20,7 @@ struct gpt_params {
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
-float temp = 1.0f;
+float temp = 0.9f;

int32_t n_batch = 8; // batch size for prompt processing

@@ -81,10 +81,3 @@ gpt_vocab::id gpt_sample_top_k_top_p(
double top_p,
double temp,
std::mt19937 & rng);

-//
-// Quantization
-//
-
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
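With the quantization declarations gone, utils.h goes back to being a pure tokenization/sampling helper, and the only behavioral change for the examples is the lower default temperature. A hedged usage sketch of the sampler with the new default (the leading parameters of gpt_sample_top_k_top_p are not visible in this hunk and are assumed from the example programs):

    // assumed in scope: gpt_vocab vocab, int n_vocab, a logits vector holding the
    // last token's n_vocab logits at its tail, and a seeded std::mt19937 rng
    gpt_params params; // params.temp now defaults to 0.9f instead of 1.0f

    gpt_vocab::id id = gpt_sample_top_k_top_p(
            vocab,
            logits.data() + (logits.size() - n_vocab),
            params.top_k, params.top_p, params.temp,
            rng);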