Upgrade v1/v2 format to v3 by leveraging quantize #1504

Open · wants to merge 6 commits into master
118 changes: 118 additions & 0 deletions ggml.c
@@ -813,6 +813,124 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");

static void quantize_shuffle_block(const uint8_t* src, uint8_t* dest, int half_size)
{
for (int j = 0; j < half_size; j++) {
// src (old layout): byte j packs elements 2j (low nibble) and 2j+1 (high nibble)
// dest (new layout): byte i packs elements i (low nibble) and i + 2*half_size (high nibble)
// half_size is half the number of quant bytes in a block, i.e. qk/4
uint8_t d1;
uint8_t d2;

d1 = src[0 + j];
d2 = src[half_size + j];

dest[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
dest[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
}
}
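// Illustrative note (not part of the original patch): with half_size = 8 (QK4_0 == 32),
// src holds 16 bytes packed as x0|x1, x2|x3, ..., x30|x31 (low|high nibble).
// After the shuffle, dest[0] = x0 | (x16 << 4), dest[1] = x1 | (x17 << 4), ...,
// dest[15] = x15 | (x31 << 4): element j ends up in the low nibble of byte j and
// element j + 16 in its high nibble.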

typedef struct {
float d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0_old;
typedef struct {
float d; // delta
float m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1_old;
typedef struct {
float d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0_old;
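// Note (added for clarity): these *_old structs mirror the pre-v3 on-disk blocks, which
// store the delta (and, for Q4_1, the min) as 32-bit floats; the current block_q4_0,
// block_q4_1 and block_q8_0 use ggml_fp16_t, so each upgraded block shrinks by 2 bytes
// (4 bytes for Q4_1) and quantize_upgrade compacts the data in place towards the front
// of the buffer.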

void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuffle) {
if (type == GGML_TYPE_Q4_0) {
int qk = ggml_blck_size(type);
const size_t nb = *size / sizeof(block_q4_0_old);
block_q4_0_old *blk = (block_q4_0_old *)data;
block_q4_0 *new_blk = (block_q4_0 *)data;
block_q4_0 new_blk_buf;
*size = nb * sizeof(block_q4_0);

for (size_t i = 0; i < nb ; i++) {

new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);

if (shuffle) {
quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
} else {
memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
}

memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_0));
}
} else if (type == GGML_TYPE_Q4_1) {
int qk = ggml_blck_size(type);
const size_t nb = *size / sizeof(block_q4_1_old);
block_q4_1_old *blk = (block_q4_1_old *)data;
block_q4_1 *new_blk = (block_q4_1 *)data;
block_q4_1 new_blk_buf;
*size = nb * sizeof(block_q4_1);

for (size_t i = 0; i < nb ; i++) {
new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
new_blk_buf.m = GGML_FP32_TO_FP16(blk[i].m);

if (shuffle) {
quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
} else {
memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
}
memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_1));
}
} else if (type == GGML_TYPE_Q5_0) {
// No size diff
int qk = ggml_blck_size(type);
const size_t nb = *size / sizeof(block_q5_0);
block_q5_0 *blk = (block_q5_0 *)data;
block_q5_0 new_blk;

for (size_t i = 0; i < nb ; i++) {
if (shuffle) {
quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
} else {
memcpy(new_blk.qs, blk[i].qs, qk / 2);
}
memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
}
} else if (type == GGML_TYPE_Q5_1) {
// No size diff
int qk = ggml_blck_size(type);
const size_t nb = *size / sizeof(block_q5_1);
block_q5_1 *blk = (block_q5_1 *)data;
block_q5_1 new_blk;

for (size_t i = 0; i < nb ; i++) {
if (shuffle) {
quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
} else {
memcpy(new_blk.qs, blk[i].qs, qk / 2);
}
memcpy(&blk[i], &new_blk, sizeof(new_blk));
}
} else if (type == GGML_TYPE_Q8_0) {
// no shuffle
int qk = ggml_blck_size(type);
const size_t nb = *size / sizeof(block_q8_0_old);
block_q8_0_old *blk = (block_q8_0_old *)data;
block_q8_0 *new_blk = (block_q8_0 *)data;
block_q8_0 new_blk_buf;
*size = nb * sizeof(block_q8_0);

for (size_t i = 0; i < nb ; i++) {
new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);

memcpy(new_blk_buf.qs, blk[i].qs, qk); // Q8_0 stores one int8 per element, so copy all qk bytes
memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q8_0));
}
}
}
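// Hedged usage sketch (illustrative only; `old_q4_data` and `n_blocks` are hypothetical
// names): upgrading a buffer of pre-v3 Q4_0 blocks in place, with the nibble shuffle
// enabled as it would be for a GGJT v1 file.
//
//   size_t size = n_blocks * sizeof(block_q4_0_old);   // byte size of the old data
//   quantize_upgrade(GGML_TYPE_Q4_0, old_q4_data, &size, /*shuffle=*/true);
//   // size is now n_blocks * sizeof(block_q4_0), i.e. 18 bytes per block instead of 20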

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
static const int qk = QK4_0;
1 change: 1 addition & 0 deletions ggml.h
@@ -1086,6 +1086,7 @@ extern "C" {

GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

GGML_API void quantize_upgrade(enum ggml_type type, void* data, size_t *size, bool needShuffle);
//
// system info
//
81 changes: 64 additions & 17 deletions llama.cpp
@@ -271,6 +271,14 @@ struct llama_context {
}
};

enum llama_file_version {
LLAMA_FILE_VERSION_GGML,
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
LLAMA_FILE_VERSION_GGJT_V1, // added padding
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
};

template <typename T>
static T checked_mul(T a, T b) {
T ret = a * b;
@@ -305,15 +313,43 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
return size / ggml_blck_size(type);
}

static size_t llama_calc_tensor_size_prev3(const std::vector<uint32_t> & ne, enum ggml_type type) {
size_t size = ggml_type_size(type);

switch (type)
{
case GGML_TYPE_Q4_0:
size += 2; // pre-v3 Q4_0 blocks store the delta as a 32-bit float instead of ggml_fp16_t
break;
case GGML_TYPE_Q4_1:
size += 4; // pre-v3 Q4_1 blocks store both the delta and the min as 32-bit floats
break;
case GGML_TYPE_Q8_0:
size += 2; // pre-v3 Q8_0 blocks store the delta as a 32-bit float instead of ggml_fp16_t
break;
default:
break;
}

for (uint32_t dim : ne) {
size = checked_mul<size_t>(size, dim);
}
return size / ggml_blck_size(type);
}
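// Worked example (added for illustration): for a Q4_0 tensor with ne = {4096, 4096},
// ggml_type_size(GGML_TYPE_Q4_0) is 18 (2-byte fp16 delta + 16 quant bytes), so this
// returns (18 + 2) * 4096 * 4096 / 32 = 10,485,760 bytes for the pre-v3 layout, versus
// 9,437,184 bytes from llama_calc_tensor_size() once the tensor has been upgraded.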

struct llama_load_tensor_shard {
std::vector<uint32_t> ne;
size_t size;
enum ggml_type type;
size_t file_idx;
size_t file_off;

void calc_size() {
size = llama_calc_tensor_size(ne, type);
void calc_size(llama_file_version file_version) {
if (file_version == LLAMA_FILE_VERSION_GGJT_V3) {
size = llama_calc_tensor_size(ne, type);
} else {
size = llama_calc_tensor_size_prev3(ne, type);
}
}
};

@@ -336,11 +372,11 @@ struct llama_load_tensor {

llama_load_tensor(const std::string & name) : name(name) {}

void calc_all() {
void calc_all(llama_file_version file_version) {
calc_type();
calc_split_type();
calc_ne();
calc_size();
calc_size(file_version);
}

void calc_type() {
@@ -392,8 +428,12 @@ struct llama_load_tensor {
}
}

void calc_size() {
size = llama_calc_tensor_size(ne, type);
void calc_size(llama_file_version file_version) {
if (file_version == LLAMA_FILE_VERSION_GGJT_V3) {
size = llama_calc_tensor_size(ne, type);
} else {
size = llama_calc_tensor_size_prev3(ne, type);
}
}
};

@@ -403,14 +443,6 @@ struct llama_load_tensors_map {
std::unordered_map<std::string, size_t> name_to_idx;
};

enum llama_file_version {
LLAMA_FILE_VERSION_GGML,
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
LLAMA_FILE_VERSION_GGJT_V1, // added padding
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
};

struct llama_file_loader {
llama_file file;
llama_file_version file_version;
@@ -513,7 +545,7 @@ struct llama_file_loader {
shard.file_idx = file_idx;
shard.file_off = file.tell();

shard.calc_size();
shard.calc_size(file_version);
file.seek(shard.size, SEEK_CUR);

auto it = tensors_map.name_to_idx.find(name);
@@ -618,7 +650,7 @@ struct llama_model_loader {
}
this->use_mmap = use_mmap;
for (llama_load_tensor & lt : tensors_map.tensors) {
lt.calc_all();
lt.calc_all(first_file->file_version);
}
}

@@ -2074,7 +2106,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
size_t new_size;
llama_buffer work;

if (!quantize) {
// only GGJT v1 files use the old interleaved nibble order; v2 files already use the
// split-half layout and only need their fp32 scales converted to fp16
bool needShuffle = (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1);

if (model_loader->file_loaders.at(0)->file_version < LLAMA_FILE_VERSION_GGJT_V3 && quantize) {
if ((quantized_type == tensor.type) &&
(tensor.type == GGML_TYPE_Q4_0 || tensor.type == GGML_TYPE_Q4_1 || tensor.type == GGML_TYPE_Q5_0 || tensor.type == GGML_TYPE_Q5_1 || tensor.type == GGML_TYPE_Q8_0)) {
// convert the tensor data in place to the v3 block format
new_type = tensor.type;
new_data = tensor.data;
new_size = tensor.size;
quantize_upgrade(new_type, new_data, &new_size, needShuffle);
printf("Upgrade - size = %8.3f MB\n", new_size/1024.0/1024.0);
}
else {
throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
}
} else if (!quantize) {
new_type = tensor.type;
new_data = tensor.data;
new_size = tensor.size;