llama : try loading tensors with pre-computed hashes #13106

Draft · wants to merge 3 commits into master
46 changes: 44 additions & 2 deletions examples/gguf-hash/gguf-hash.cpp
@@ -55,6 +55,7 @@ typedef enum {

struct hash_params {
std::string input;
bool fnv = false;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
@@ -103,6 +104,7 @@ static void hash_print_usage(const char * executable) {
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --fnv use FNV-1a hash\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
@@ -131,6 +133,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
exit(0);
}

if (arg == "--fnv") {
arg_found = true;
params.fnv = true;
}

if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
@@ -283,6 +290,18 @@ static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char u
uuid[ 8] |= (0x8 << 4);
}

// Computes FNV-1a hash of the data
static uint64_t fnv_hash(const uint8_t * data, size_t len) {
const uint64_t fnv_prime = 0x100000001b3ULL;
uint64_t hash = 0xcbf29ce484222325ULL;

for (size_t i = 0; i < len; ++i) {
hash ^= data[i];
hash *= fnv_prime;
}
return hash;
}
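A quick sanity check for the FNV-1a constants above (offset basis 0xcbf29ce484222325, prime 0x100000001b3); this is an illustrative sketch, not part of the change:

```cpp
#include <cassert>
#include <cstdint>

// Sketch: verify the FNV-1a helper above against the reference definition.
static void fnv_hash_selftest() {
    // An empty buffer hashes to the offset basis.
    assert(fnv_hash(nullptr, 0) == 0xcbf29ce484222325ULL);
    // A single byte folds in as hash = (basis ^ byte) * prime (mod 2^64).
    const uint8_t byte = 0x61; // 'a'
    assert(fnv_hash(&byte, 1) == (0xcbf29ce484222325ULL ^ 0x61ULL) * 0x100000001b3ULL);
}
```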

static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
@@ -326,7 +345,11 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}

struct gguf_context * ctx_out = gguf_init_empty();
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

gguf_set_kv(ctx_out, ctx);

const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
@@ -335,10 +358,26 @@
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
gguf_add_tensor(ctx_out, cur);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;

if (hash_params.fnv) {
uint64_t hash = fnv_hash((const uint8_t *)raw_data, n_bytes);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}

printf("%-8s %-s %s\n", "fnv", hex_result, tensor_layer_name.c_str());

char hash_key[128];
snprintf(hash_key, sizeof(hash_key), "%s_hash", name);
gguf_set_val_u64(ctx_out, hash_key, hash);
}

if (hash_params.xxh64) {

if (!hash_params.no_layer) {
@@ -580,6 +619,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
}
}

auto fname_out = fname + ".rpc";
gguf_write_to_file(ctx_out, fname_out.c_str(), false);
gguf_free(ctx_out);

ggml_free(ctx_data);
gguf_free(ctx);
@@ -663,7 +705,7 @@ int main(int argc, const char ** argv) {

// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
@@ -680,7 +722,7 @@
}

// By default if no switch argument provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}

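The modified tool writes a GGUF copy of the input to "<input>.rpc"; with --fnv it also stores one u64 KV per tensor under the key "<tensor name>_hash". A minimal sketch of reading such a hash back with the existing gguf API (illustration only, not part of the change):

```cpp
#include <cstdint>
#include <string>
#include "gguf.h"

// Sketch: look up the pre-computed hash stored for one tensor in a "<input>.rpc" file.
// Assumes the "<tensor name>_hash" key convention used above.
static bool read_tensor_hash(const char * fname, const char * tensor_name, uint64_t & hash_out) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (ctx == nullptr) {
        return false;
    }
    const std::string key    = std::string(tensor_name) + "_hash";
    const int64_t     key_id = gguf_find_key(ctx, key.c_str());
    const bool        found  = key_id >= 0;
    if (found) {
        hash_out = gguf_get_val_u64(ctx, key_id);
    }
    gguf_free(ctx);
    return found;
}
```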
2 changes: 2 additions & 0 deletions ggml/include/ggml-rpc.h
@@ -27,6 +27,8 @@ GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, cons
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
GGML_BACKEND_API bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
size_t offset, uint64_t hash);

#ifdef __cplusplus
}
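llama.cpp does not link the RPC backend directly, so callers are expected to resolve this new entry point through the backend registry (the loader changes further down do exactly this). A hedged sketch of that lookup:

```cpp
#include <cstdint>
#include "ggml-backend.h"

// Sketch: resolve ggml_backend_rpc_buffer_load_tensor without linking ggml-rpc directly.
// The typedef mirrors the declaration above; "RPC" is the backend's registered name.
typedef bool (*rpc_load_tensor_fn)(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
                                   size_t offset, uint64_t hash);

static rpc_load_tensor_fn get_rpc_load_tensor_fn(void) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
    if (reg == nullptr) {
        return nullptr; // RPC backend not built or not registered
    }
    return (rpc_load_tensor_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_buffer_load_tensor");
}
```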
63 changes: 37 additions & 26 deletions ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -151,6 +151,12 @@ struct rpc_msg_buffer_clear_req {
uint8_t value;
};

struct rpc_msg_set_tensor_hash_req {
rpc_tensor tensor;
uint64_t offset;
uint64_t hash;
};

struct rpc_msg_set_tensor_hash_rsp {
uint8_t result;
};
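The request is now a single fixed-size struct rather than a hand-packed byte vector, so its size should match the previous serialization format | rpc_tensor | offset (8 bytes) | hash (8 bytes) | exactly. A compile-time check along these lines could make that explicit (sketch; assumes the struct sits in the same packed region as the other rpc_msg_* types):

```cpp
// Sketch: keep the wire layout identical to the legacy byte-vector format.
static_assert(sizeof(rpc_msg_set_tensor_hash_req) == sizeof(rpc_tensor) + 2 * sizeof(uint64_t),
              "rpc_msg_set_tensor_hash_req must match | rpc_tensor | offset | hash |");
```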
@@ -543,15 +549,12 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
rpc_tensor rpc_tensor = serialize_tensor(tensor);
if (size > HASH_THRESHOLD) {
// input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
std::vector<uint8_t> input(input_size, 0);
uint64_t hash = fnv_hash((const uint8_t*)data, size);
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
rpc_msg_set_tensor_hash_req request;
request.tensor = serialize_tensor(tensor);
request.offset = offset;
request.hash = fnv_hash((const uint8_t*)data, size);
rpc_msg_set_tensor_hash_rsp response;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
if (response.result) {
// the server has the same data, no need to send it
@@ -597,6 +600,18 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
return response.result;
}

bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
rpc_msg_set_tensor_hash_req request;
request.tensor = serialize_tensor(tensor);
request.offset = offset;
request.hash = hash;
rpc_msg_set_tensor_hash_rsp response;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
return response.result;
}

static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
@@ -859,7 +874,7 @@ class rpc_server {
bool free_buffer(const rpc_msg_free_buffer_req & request);
bool buffer_clear(const rpc_msg_buffer_clear_req & request);
bool set_tensor(const std::vector<uint8_t> & input);
bool set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response);
bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -1096,18 +1111,10 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
return true;
}

bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response)
bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
{
// serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
if (input.size() != sizeof(rpc_tensor) + 16) {
return false;
}
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
uint64_t offset;
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
const uint64_t * hash = (const uint64_t *)(input.data() + sizeof(rpc_tensor) + sizeof(offset));
std::vector<uint8_t> cached_file;
if (!get_cached_file(*hash, cached_file)) {
if (!get_cached_file(request.hash, cached_file)) {
response.result = 0;
return true;
}
@@ -1120,7 +1127,7 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) {
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
return false;
@@ -1132,13 +1139,15 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);

if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
if (request.tensor.data + request.offset < p0
|| request.tensor.data + request.offset >= p1
|| size > (p1 - request.tensor.data - request.offset)) {
GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
__func__, in_tensor->data, offset, size, *hash, p0, p1);
__func__, request.tensor.data, request.offset, size, request.hash, p0, p1);
return false;
}
}
ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
ggml_backend_tensor_set(tensor, cached_file.data(), request.offset, size);
response.result = 1;
return true;
}
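For intuition, here is the bounds check above with concrete, made-up numbers; a sketch only, the real values come from the backend buffer:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Sketch: the same region check with hypothetical addresses.
static void bounds_check_example() {
    const size_t   p0     = 0x1000;          // buffer base
    const size_t   p1     = p0 + 0x100;      // buffer base + buffer size
    const uint64_t data   = 0x1040;          // tensor start, as sent by the client
    const uint64_t offset = 0;
    const size_t   size   = 0x20;            // ggml_nbytes(tensor)

    const bool out_of_bounds = data + offset <  p0
                            || data + offset >= p1
                            || size > (p1 - data - offset);
    assert(!out_of_bounds); // 0x1040..0x1060 lies inside [0x1000, 0x1100)
}
```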
@@ -1498,12 +1507,12 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
break;
}
case RPC_CMD_SET_TENSOR_HASH: {
std::vector<uint8_t> input;
if (!recv_msg(sockfd, input)) {
rpc_msg_set_tensor_hash_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
rpc_msg_set_tensor_hash_rsp response;
if (!server.set_tensor_hash(input, response)) {
if (!server.set_tensor_hash(request, response)) {
return;
}
if (!send_msg(sockfd, &response, sizeof(response))) {
@@ -1747,6 +1756,8 @@ static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
return (void *)ggml_backend_rpc_add_device;
} else if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
return (void *)ggml_backend_rpc_buffer_load_tensor;
}
return NULL;

41 changes: 34 additions & 7 deletions src/llama-model-loader.cpp
@@ -376,6 +376,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key<bool> (enum llm_kv kid, bool & result, bool required);
template bool llama_model_loader::get_key<float> (enum llm_kv kid, float & result, bool required);
template bool llama_model_loader::get_key<uint32_t> (enum llm_kv kid, uint32_t & result, bool required);
template bool llama_model_loader::get_key<uint64_t> (enum llm_kv kid, uint64_t & result, bool required);
template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);

template<>
@@ -688,6 +689,10 @@ llama_model_loader::llama_model_loader(

this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (rpc_reg) {
ggml_backend_rpc_buffer_load_tensor_fn = (ggml_backend_rpc_buffer_load_tensor_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_buffer_load_tensor");
}
}

std::string llama_model_loader::get_arch_name() const {
Expand Down Expand Up @@ -881,6 +886,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
}
}

bool llama_model_loader::rpc_load_tensor(struct ggml_tensor * cur) {
if (!ggml_backend_rpc_buffer_load_tensor_fn) {
return false;
}
char hash_key[128];
snprintf(hash_key, sizeof(hash_key), "%s_hash", ggml_get_name(cur));
uint64_t hash_val = 0;
if (!get_key(hash_key, hash_val, false)) {
return false;
}
ggml_backend_buffer_t buf = cur->view_src ? cur->view_src->buffer : cur->buffer;
const char * buf_name = ggml_backend_buffer_name(buf);
if (strncmp(buf_name, "RPC", 3) != 0) {
return false;
}
return ggml_backend_rpc_buffer_load_tensor_fn(buf, cur, 0, hash_val);
}

bool llama_model_loader::load_all_data(
struct ggml_context * ctx,
llama_buf_map & bufs,
@@ -1022,7 +1045,9 @@ bool llama_model_loader::load_all_data(
mmap_used.first = std::min(mmap_used.first, weight->offs);
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
} else {
ggml_backend_tensor_set(cur, data, 0, n_size);
if (!rpc_load_tensor(cur)) {
ggml_backend_tensor_set(cur, data, 0, n_size);
}
}
} else {
const auto & file = files.at(weight->idx);
@@ -1054,12 +1079,14 @@ bool llama_model_loader::load_all_data(
buffer_idx %= n_buffers;
}
} else {
read_buf.resize(n_size);
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
if (check_tensors || !rpc_load_tensor(cur)) { // validation needs the raw bytes, so read from file when checking; otherwise try the RPC hash path first
read_buf.resize(n_size);
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
}
}
}
}
4 changes: 4 additions & 0 deletions src/llama-model-loader.h
@@ -155,6 +155,10 @@ struct llama_model_loader {
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const;

typedef bool (*ggml_backend_rpc_buffer_load_tensor_t)(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash);
ggml_backend_rpc_buffer_load_tensor_t ggml_backend_rpc_buffer_load_tensor_fn = nullptr;
bool rpc_load_tensor(struct ggml_tensor * cur);

// Returns false if cancelled by progress_callback
bool load_all_data(
struct ggml_context * ctx,