Allow s390x to load little endian models unmodified #11234

Open. Wants to merge 12 commits into master (base branch).
10 changes: 6 additions & 4 deletions common/common.cpp
@@ -1418,8 +1418,9 @@ struct llama_model * common_load_model_from_url(
     int n_split = 0;
     {
         struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
+            /*.no_alloc           = */ true,
+            /*.ctx                = */ NULL,
+            /*.allow_byteswapping = */ true,
         };
         auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
@@ -2063,8 +2064,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
-        /* .ctx      = */ &ctx,
+        /* .no_alloc           = */ false,
+        /* .ctx                = */ &ctx,
+        /* .allow_byteswapping = */ true,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
5 changes: 3 additions & 2 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ &ctx_data,
+        /*.allow_byteswapping = */ true,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(filename, params);
5 changes: 3 additions & 2 deletions examples/export-lora/export-lora.cpp
@@ -48,8 +48,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
 
 static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ ctx_ggml,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ ctx_ggml,
+        /*.allow_byteswapping = */ true,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
     if (!ctx_gguf) {
5 changes: 3 additions & 2 deletions examples/gguf-hash/gguf-hash.cpp
@@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ &ctx_data,
+        /*.allow_byteswapping = */ true,
     };
 
     // xxh64 init
32 changes: 26 additions & 6 deletions examples/gguf-split/gguf-split.cpp
@@ -328,14 +328,20 @@ struct split_strategy {
             const char * t_name = gguf_get_tensor_name(ctx_out, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
             auto n_bytes = ggml_nbytes(t);
+            auto n_elements = ggml_nelements(t) / ggml_blck_size(t->type);
             read_buf.resize(n_bytes);
 
             // calculate offset
             auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
             auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
 
+            ggml_byteswap_t byteswap_func = nullptr;
+            if (gguf_needs_byteswap(ctx_gguf)) {
+                byteswap_func = ggml_get_type_traits(t->type)->byteswap;
+            }
+
             // copy tensor from input to output file
-            copy_file_to_file(f_input, fout, offset, n_bytes);
+            copy_file_to_file(f_input, fout, offset, n_bytes, n_elements, byteswap_func);
             zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
         }
 
@@ -346,13 +352,18 @@ struct split_strategy {
         }
     }
 
-    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
+    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len, const size_t elements, ggml_byteswap_t byteswap_func) {
        // TODO: detect OS and use copy_file_range() here for better performance
        if (read_buf.size() < len) {
            read_buf.resize(len);
        }
        f_in.seekg(in_offset);
        f_in.read((char *)read_buf.data(), len);
+
+        if (byteswap_func != nullptr) {
+            byteswap_func(read_buf.data(), elements);
+        }
+
        f_out.write((const char *)read_buf.data(), len);
    }
 };
@@ -361,8 +372,9 @@ static void gguf_split(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &ctx_meta,
+        /*.allow_byteswapping = */ true,
     };
 
     std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
@@ -426,8 +438,9 @@ static void gguf_merge(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &ctx_meta,
+        /*.allow_byteswapping = */ true,
    };
 
    if (i_split > 0) {
@@ -541,6 +554,13 @@ static void gguf_merge(const split_params & split_params) {
             f_input.seekg(offset);
             f_input.read((char *)read_data.data(), n_bytes);
 
+            if (gguf_needs_byteswap(ctx_gguf)) {
+                auto byteswap = ggml_get_type_traits(t->type)->byteswap;
+                if (byteswap != nullptr) {
+                    byteswap(read_data.data(), ggml_nelements(t) / ggml_blck_size(t->type));
+                }
+            }
+
             // write tensor data + padding
             fout.write((const char *)read_data.data(), n_bytes);
             zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
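The count handed to the byteswap callback above is ggml_nelements(t) / ggml_blck_size(t->type), i.e. the number of type-level blocks rather than individual scalars, which lets quantized types swap block-by-block. As a rough illustration of what such a callback could look like (a sketch, not code from this PR; the struct mirrors ggml's Q8_0 block layout but the names here are made up):

#include <stdint.h>
#include <stddef.h>

// Hypothetical Q8_0-style block: a 2-byte fp16 scale followed by 32 one-byte
// quants. Only the multi-byte scale is endian-sensitive; int8 values are not.
struct block_q8_0_like {
    uint16_t d;       // fp16 scale bits
    int8_t   qs[32];  // quantized values
};

// Matches the ggml_byteswap_t signature: "elements" counts blocks, not scalars.
static void byteswap_q8_0_like(void * buffer, size_t elements) {
    struct block_q8_0_like * blocks = (struct block_q8_0_like *) buffer;
    for (size_t i = 0; i < elements; ++i) {
        const uint16_t d = blocks[i].d;
        blocks[i].d = (uint16_t) ((d >> 8) | (d << 8));
    }
}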
10 changes: 6 additions & 4 deletions examples/gguf/gguf.cpp
@@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) {
 // just read tensor info
 static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ NULL,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ NULL,
+        /*.allow_byteswapping = */ true,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ &ctx_data,
+        /*.allow_byteswapping = */ true,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
5 changes: 3 additions & 2 deletions examples/llava/clip.cpp
@@ -1122,8 +1122,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &meta,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &meta,
+        /*.allow_byteswapping = */ true,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname, params);
2 changes: 2 additions & 0 deletions ggml/include/ggml.h
@@ -2144,6 +2144,7 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_byteswap_t)  (      void  * GGML_RESTRICT buffer, size_t elements);
 
     struct ggml_type_traits {
         const char * type_name;
@@ -2153,6 +2154,7 @@ extern "C" {
         bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float_ref;
+        ggml_byteswap_t   byteswap;
     };
 
     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
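With the new trait in place, callers fetch the per-type callback through the existing ggml_get_type_traits() lookup, as the gguf-split changes above do. A minimal sketch of that pattern (the helper name is ours; a NULL byteswap entry is taken to mean the type has no swap routine):

#include "ggml.h"

// Swap a raw tensor buffer in place if the type provides a byteswap routine.
// n_blocks should be ggml_nelements(t) / ggml_blck_size(type).
static void byteswap_buffer(enum ggml_type type, void * data, size_t n_blocks) {
    const struct ggml_type_traits * traits = ggml_get_type_traits(type);
    if (traits->byteswap != NULL) {
        traits->byteswap(data, n_blocks);
    }
}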
5 changes: 5 additions & 0 deletions ggml/include/gguf.h
@@ -74,6 +74,8 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+
+        bool allow_byteswapping;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -197,6 +199,9 @@ extern "C" {
     // writes the meta data to pointer "data"
     GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
 
+    // returns true if gguf file needs byteswapping when reading. byteswapping for writing not implemented
+    GGML_API bool gguf_needs_byteswap(const struct gguf_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
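Putting the two additions together, a reader opts in through the init flag and can then ask whether the file's byte order differs from the host's. A minimal sketch (the file name and error handling are illustrative only):

#include "gguf.h"
#include <stdio.h>

int main(void) {
    struct gguf_init_params params = {
        /*.no_alloc           = */ true,
        /*.ctx                = */ NULL,
        /*.allow_byteswapping = */ true,  // opt in to loading cross-endian files
    };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load gguf file\n");
        return 1;
    }
    if (gguf_needs_byteswap(ctx)) {
        // tensor data read straight from this file must be byteswapped
        // (e.g. via the per-type traits) before it is interpreted
        printf("file endianness differs from host\n");
    }
    gguf_free(ctx);
    return 0;
}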
43 changes: 43 additions & 0 deletions ggml/src/ggml-impl.h
@@ -28,6 +28,14 @@
 #include <immintrin.h>
 #endif
 
+#if defined(__gnu_linux__)
+#include <byteswap.h>
+#else // defined(__gnu_linux__)
+#define bswap_16(x) (x)
+#define bswap_32(x) (x)
+#define bswap_64(x) (x)
+#endif // defined(__gnu_linux__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -553,12 +561,47 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
+// endianness conversion
+static inline void ggml_bswap16(void * value) {
+    *((uint16_t*)value) = bswap_16(*((uint16_t*)value));
+}
+
+static inline void ggml_bswap32(void * value) {
+    *((uint32_t*)value) = bswap_32(*((uint32_t*)value));
+}
+
+static inline void ggml_bswap64(void * value) {
+    *((uint64_t*)value) = bswap_64(*((uint64_t*)value));
+}
 
 #ifdef __cplusplus
 }
 #endif
 
+#ifdef __cplusplus
+#include <vector>
+#include <type_traits>
+
+template <typename T, std::enable_if_t<sizeof(T) == 1, int> = 0>
+static inline void ggml_bswap(T * value) {
+    GGML_UNUSED(value);
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 2, int> = 0>
+static inline void ggml_bswap(T * value) {
+    ggml_bswap16(value);
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 4, int> = 0>
+static inline void ggml_bswap(T * value) {
+    ggml_bswap32(value);
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 8, int> = 0>
+static inline void ggml_bswap(T * value) {
+    ggml_bswap64(value);
+}
+
 // expose GGUF internals for test code
 GGML_API size_t gguf_type_size(enum gguf_type type);
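The C++ overload set dispatches purely on sizeof(T), so one ggml_bswap() call handles any fixed-width scalar, and 1-byte types compile to a no-op. A standalone sanity check of that dispatch (our test, not part of the PR; it assumes ggml-impl.h and its dependencies are on the include path):

#include <cstdint>
#include <cstdio>
#include "ggml-impl.h"  // for the ggml_bswap templates above

int main() {
    uint16_t h = 0x1234;
    uint32_t w = 0x12345678u;
    uint64_t d = 0x0102030405060708ull;
    ggml_bswap(&h);  // sizeof(T) == 2 -> ggml_bswap16
    ggml_bswap(&w);  // sizeof(T) == 4 -> ggml_bswap32
    ggml_bswap(&d);  // sizeof(T) == 8 -> ggml_bswap64
    printf("%04x %08x %016llx\n", (unsigned) h, (unsigned) w, (unsigned long long) d);
    return 0;
}

Note that with the non-Linux fallback above, bswap_16/32/64 are identity macros, so on such platforms these calls leave the values unchanged; the helpers only perform real swaps where <byteswap.h> is available.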