From b4e5549472907b73afb1220aa17bfcf32b125d4f Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Thu, 24 Apr 2025 17:02:59 +0300
Subject: [PATCH 1/3] llama : try loading tensors with pre-computed hashes

Export a function for loading tensor data with pre-computed hash from
the RPC backend and use it in the model loader when available
---
 examples/gguf-hash/gguf-hash.cpp | 39 ++++++++++++++++++++++++++++++--
 ggml/include/ggml-rpc.h          |  2 ++
 ggml/src/ggml-rpc/ggml-rpc.cpp   | 17 ++++++++++++++
 src/llama-model-loader.cpp       | 27 +++++++++++++++++++++-
 src/llama-model-loader.h         |  4 ++++
 5 files changed, 86 insertions(+), 3 deletions(-)
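
Note (a sketch, not part of the patch): with this change `gguf-hash --fnv model.gguf`
also writes a `model.gguf.rpc` sidecar GGUF that stores one u64 value per tensor under
the key "<tensor name>_hash"; the model loader later looks these keys up. The file name
suffix and the "_hash" key convention come from the patch below; everything else in the
following snippet is illustrative only and assumes the public gguf API as used by the
gguf-hash example:

    // sketch: dump the per-tensor FNV-1a hashes stored in a sidecar file
    #include "gguf.h"

    #include <cinttypes>
    #include <cstdio>
    #include <cstring>

    int main(int argc, char ** argv) {
        if (argc != 2) {
            fprintf(stderr, "usage: %s <model.gguf.rpc>\n", argv[0]);
            return 1;
        }
        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx      = */ nullptr,
        };
        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (ctx == nullptr) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }
        const int64_t n_kv = gguf_get_n_kv(ctx);
        for (int64_t i = 0; i < n_kv; ++i) {
            const char * key = gguf_get_key(ctx, i);
            const size_t len = strlen(key);
            // per-tensor hashes are stored as u64 values under "<tensor name>_hash"
            if (len > 5 && strcmp(key + len - 5, "_hash") == 0 && gguf_get_kv_type(ctx, i) == GGUF_TYPE_UINT64) {
                printf("%016" PRIx64 " %s\n", gguf_get_val_u64(ctx, i), key);
            }
        }
        gguf_free(ctx);
        return 0;
    }
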
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index 9523ec122f573..630e40d6a34a1 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -55,6 +55,7 @@ typedef enum {
 
 struct hash_params {
     std::string input;
+    bool fnv = false;
     bool xxh64 = false;
     bool sha1 = false;
     bool sha256 = false;
@@ -103,6 +104,7 @@ static void hash_print_usage(const char * executable) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help              show this help message and exit\n");
+    printf("      --fnv               use FNV-1a hash\n");
     printf("      --xxh64             use xxh64 hash\n");
     printf("      --sha1              use sha1 hash\n");
     printf("      --sha256            use sha256 hash\n");
@@ -131,6 +133,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             exit(0);
         }
 
+        if (arg == "--fnv") {
+            arg_found = true;
+            params.fnv = true;
+        }
+
         if (arg == "--xxh64") {
             arg_found = true;
             params.xxh64 = true;
@@ -283,6 +290,18 @@ static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char u
     uuid[ 8] |= (0x8 << 4);
 }
 
+// Computes FNV-1a hash of the data
+static uint64_t fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return hash;
+}
+
 static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     const std::string & fname = hash_params.input;
     struct ggml_context * ctx_data = NULL;
@@ -326,7 +345,11 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
     }
 
+    struct gguf_context * ctx_out = gguf_init_empty();
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    gguf_set_kv(ctx_out, ctx);
+
     const int n_tensors = gguf_get_n_tensors(ctx);
     bool tensor_layer_in_manifest = false;
     bool model_in_manifest = false;
@@ -335,10 +358,19 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        gguf_add_tensor(ctx_out, cur);
         auto n_bytes = ggml_nbytes(cur);
         auto *raw_data = cur->data;
         const std::string tensor_layer_name = fname + ":" + name;
 
+        if (hash_params.fnv) {
+            uint64_t hash = fnv_hash((const uint8_t *)raw_data, n_bytes);
+            printf("%016lx %s\n", hash, tensor_layer_name.c_str());
+            char hash_key[128];
+            snprintf(hash_key, sizeof(hash_key), "%s_hash", name);
+            gguf_set_val_u64(ctx_out, hash_key, hash);
+        }
+
         if (hash_params.xxh64) {
 
             if (!hash_params.no_layer) {
@@ -580,6 +612,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    auto fname_out = fname + ".rpc";
+    gguf_write_to_file(ctx_out, fname_out.c_str(), false);
+    gguf_free(ctx_out);
     ggml_free(ctx_data);
     gguf_free(ctx);
 
@@ -663,7 +698,7 @@ int main(int argc, const char ** argv) {
 
         // Autoselect the highest security hash if manifest is provided but
        // the user has not specifically defined the hash they care about
-        if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+        if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
            // User has not selected a specific value, pick most secure hash
            if (manifest_check.sha256) {
                params.sha256 = true;
@@ -680,7 +715,7 @@ int main(int argc, const char ** argv) {
     }
 
     // By default if no swich argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
         params.xxh64 = true;
     }
 
diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h
index 1e674112767c9..f035552f8e67c 100644
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -27,6 +27,8 @@ GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, cons
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 
 GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
+                                                          size_t offset, uint64_t hash);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 140a775f9806f..e0ab38b33f48b 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -597,6 +597,21 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }
 
+bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_tensor rpc_tensor = serialize_tensor(tensor);
+    // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
+    size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
+    std::vector<uint8_t> input(input_size, 0);
+    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
+    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
+    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+    rpc_msg_set_tensor_hash_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
@@ -1747,6 +1762,8 @@ static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg
 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
         return (void *)ggml_backend_rpc_add_device;
+    } else if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
     }
 
     return NULL;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index ea73a8a7ba944..32bc457458263 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -376,6 +376,7 @@ namespace GGUFMeta {
     template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result, bool required);
     template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result, bool required);
     template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result, bool required);
+    template bool llama_model_loader::get_key<uint64_t>   (enum llm_kv kid, uint64_t & result, bool required);
     template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
 
     template<>
@@ -688,6 +689,10 @@ llama_model_loader::llama_model_loader(
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (rpc_reg) {
+        ggml_backend_rpc_buffer_load_tensor_fn = (ggml_backend_rpc_buffer_load_tensor_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_buffer_load_tensor");
+    }
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -881,6 +886,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     }
 }
 
+bool llama_model_loader::rpc_load_tensor(struct ggml_tensor * cur) {
+    if (!ggml_backend_rpc_buffer_load_tensor_fn) {
+        return false;
+    }
+    char hash_key[128];
+    snprintf(hash_key, sizeof(hash_key), "%s_hash", ggml_get_name(cur));
+    uint64_t hash_val = 0;
+    if (!get_key(hash_key, hash_val, false)) {
+        return false;
+    }
+    ggml_backend_buffer_t buf = cur->view_src ? cur->view_src->buffer : cur->buffer;
+    const char * buf_name = ggml_backend_buffer_name(buf);
+    if (strncmp(buf_name, "RPC", 3) != 0) {
+        return false;
+    }
+    return ggml_backend_rpc_buffer_load_tensor_fn(buf, cur, 0, hash_val);
+}
+
 bool llama_model_loader::load_all_data(
     struct ggml_context * ctx,
     llama_buf_map & bufs,
@@ -1022,7 +1045,9 @@ bool llama_model_loader::load_all_data(
                     mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                     mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur, data, 0, n_size);
+                    if (!rpc_load_tensor(cur)) {
+                        ggml_backend_tensor_set(cur, data, 0, n_size);
+                    }
                 }
             } else {
                 const auto & file = files.at(weight->idx);
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index 0f52b011b6986..feafead804c34 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -155,6 +155,10 @@ struct llama_model_loader {
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const;
 
+    typedef bool (*ggml_backend_rpc_buffer_load_tensor_t)(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash);
+    ggml_backend_rpc_buffer_load_tensor_t ggml_backend_rpc_buffer_load_tensor_fn = nullptr;
+    bool rpc_load_tensor(struct ggml_tensor * cur);
+
     // Returns false if cancelled by progress_callback
     bool load_all_data(
         struct ggml_context * ctx,

From 25909ca56dc07bc474aa254cd0b81d535a3b8dfc Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Mon, 28 Apr 2025 15:03:02 +0300
Subject: [PATCH 2/3] add rpc_msg_set_tensor_hash_req

---
 ggml/src/ggml-rpc/ggml-rpc.cpp | 60 +++++++++++++++-------------------
 1 file changed, 27 insertions(+), 33 deletions(-)
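
Note (a sketch, not part of the patch): the request is now copied onto the wire as a
single fixed-size struct, so its layout has to match between client and server. Assuming
rpc_tensor keeps its 8-byte alignment, a compile-time check next to the struct definition
could guard against accidental padding; it mirrors the size the removed hand-rolled
serialization computed (sizeof(rpc_tensor) plus two 8-byte fields):

    // sketch: layout sanity check for the raw-copied request struct
    static_assert(sizeof(rpc_msg_set_tensor_hash_req) == sizeof(rpc_tensor) + 2 * sizeof(uint64_t),
                  "unexpected padding in rpc_msg_set_tensor_hash_req");
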
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index e0ab38b33f48b..4d38495b3e7fa 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -151,6 +151,12 @@ struct rpc_msg_buffer_clear_req {
     uint8_t value;
 };
 
+struct rpc_msg_set_tensor_hash_req {
+    rpc_tensor tensor;
+    uint64_t offset;
+    uint64_t hash;
+};
+
 struct rpc_msg_set_tensor_hash_rsp {
     uint8_t result;
 };
@@ -543,15 +549,12 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_tensor rpc_tensor = serialize_tensor(tensor);
     if (size > HASH_THRESHOLD) {
-        // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
-        size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
-        std::vector<uint8_t> input(input_size, 0);
-        uint64_t hash = fnv_hash((const uint8_t*)data, size);
-        memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
-        memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
-        memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+        rpc_msg_set_tensor_hash_req request;
+        request.tensor = serialize_tensor(tensor);
+        request.offset = offset;
+        request.hash = fnv_hash((const uint8_t*)data, size);
         rpc_msg_set_tensor_hash_rsp response;
-        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
         GGML_ASSERT(status);
         if (response.result) {
             // the server has the same data, no need to send it
@@ -599,15 +602,12 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
 
 bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    rpc_tensor rpc_tensor = serialize_tensor(tensor);
-    // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
-    size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
-    std::vector<uint8_t> input(input_size, 0);
-    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
-    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
-    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+    rpc_msg_set_tensor_hash_req request;
+    request.tensor = serialize_tensor(tensor);
+    request.offset = offset;
+    request.hash = hash;
     rpc_msg_set_tensor_hash_rsp response;
-    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
     GGML_ASSERT(status);
     return response.result;
 }
@@ -874,7 +874,7 @@ class rpc_server {
     bool free_buffer(const rpc_msg_free_buffer_req & request);
     bool buffer_clear(const rpc_msg_buffer_clear_req & request);
     bool set_tensor(const std::vector<uint8_t> & input);
-    bool set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response);
+    bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
     bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
     bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
     bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -1111,18 +1111,10 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
     return true;
 }
 
-bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response)
+bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
 {
-    // serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
-    if (input.size() != sizeof(rpc_tensor) + 16) {
-        return false;
-    }
-    const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
-    uint64_t offset;
-    memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
-    const uint64_t * hash = (const uint64_t *)(input.data() + sizeof(rpc_tensor) + sizeof(offset));
     std::vector<uint8_t> cached_file;
-    if (!get_cached_file(*hash, cached_file)) {
+    if (!get_cached_file(request.hash, cached_file)) {
         response.result = 0;
         return true;
     }
@@ -1135,7 +1127,7 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
     ggml_context_ptr ctx_ptr { ggml_init(params) };
     GGML_ASSERT(ctx_ptr != nullptr);
     ggml_context * ctx = ctx_ptr.get();
-    ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
     if (tensor == nullptr) {
         GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
         return false;
@@ -1147,13 +1139,15 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
         const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
         const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
 
-        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+        if (request.tensor.data + request.offset < p0
+            || request.tensor.data + request.offset >= p1
+            || size > (p1 - request.tensor.data - request.offset)) {
             GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
                            __func__, in_tensor->data, offset, size, *hash, p0, p1);
             return false;
         }
     }
-    ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
+    ggml_backend_tensor_set(tensor, cached_file.data(), request.offset, size);
     response.result = 1;
     return true;
 }
@@ -1513,12 +1507,12 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                 break;
             }
             case RPC_CMD_SET_TENSOR_HASH: {
-                std::vector<uint8_t> input;
-                if (!recv_msg(sockfd, input)) {
+                rpc_msg_set_tensor_hash_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
                     return;
                 }
                 rpc_msg_set_tensor_hash_rsp response;
-                if (!server.set_tensor_hash(input, response)) {
+                if (!server.set_tensor_hash(request, response)) {
                     return;
                 }
                 if (!send_msg(sockfd, &response, sizeof(response))) {

From dc89d36802b1bfbaeda9dcf984287a32fbf3a2bf Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Tue, 29 Apr 2025 13:22:37 +0300
Subject: [PATCH 3/3] handle no-mmap as well

---
 examples/gguf-hash/gguf-hash.cpp |  9 ++++++++-
 ggml/src/ggml-rpc/ggml-rpc.cpp   |  2 +-
 src/llama-model-loader.cpp       | 14 ++++++++------
 3 files changed, 17 insertions(+), 8 deletions(-)
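
Note (a sketch, not part of the patch): the byte-by-byte hex formatting added below
replaces the non-portable "%016lx" from the first patch. Assuming <cinttypes> is
available in gguf-hash.cpp, the same output could also be produced in a single call:

    // sketch: portable 16-digit lowercase hex formatting of the 64-bit hash
    printf("%-8s %016" PRIx64 " %s\n", "fnv", hash, tensor_layer_name.c_str());
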
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index 630e40d6a34a1..6e966a3c290dd 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -365,7 +365,14 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
 
         if (hash_params.fnv) {
             uint64_t hash = fnv_hash((const uint8_t *)raw_data, n_bytes);
-            printf("%016lx %s\n", hash, tensor_layer_name.c_str());
+            char hex_result[17];
+            for (int offset = 0; offset < 8; offset++) {
+                unsigned int shift_bits_by = (8 * (8 - offset - 1));
+                snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
+            }
+
+            printf("%-8s %-s %s\n", "fnv", hex_result, tensor_layer_name.c_str());
+
             char hash_key[128];
             snprintf(hash_key, sizeof(hash_key), "%s_hash", name);
             gguf_set_val_u64(ctx_out, hash_key, hash);
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 4d38495b3e7fa..b546b9a7548de 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -1143,7 +1143,7 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
             || request.tensor.data + request.offset >= p1
             || size > (p1 - request.tensor.data - request.offset)) {
             GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
-                           __func__, in_tensor->data, offset, size, *hash, p0, p1);
+                           __func__, request.tensor.data, request.offset, size, request.hash, p0, p1);
             return false;
         }
     }
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 32bc457458263..2849f5f4cd658 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -1079,12 +1079,14 @@ bool llama_model_loader::load_all_data(
                         buffer_idx %= n_buffers;
                     }
                 } else {
-                    read_buf.resize(n_size);
-                    file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), n_size);
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    if (!check_tensors && !rpc_load_tensor(cur)) {
+                        read_buf.resize(n_size);
+                        file->seek(weight->offs, SEEK_SET);
+                        file->read_raw(read_buf.data(), n_size);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                        if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                        }
                     }
                 }
             }