Implement --no-byteswap argument to disable byteswapping on big endian platform
AlekseiNikiforovIBM committed Jan 21, 2025
1 parent b03a984 commit 43544fb
Showing 19 changed files with 86 additions and 46 deletions.
7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -1390,6 +1390,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = false;
}
).set_env("LLAMA_ARG_NO_MMAP"));
add_opt(common_arg(
{"--no-byteswap"},
"don't byteswap model data on big endian systems (use if model is byteswapped to big endian in advance)",
[](common_params & params) {
params.no_byteswap = true;
}
).set_env("LLAMA_NO_BYTESWAP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
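For context (not part of the diff), a minimal sketch of how the new option is meant to reach the model loader through the common helpers changed below; all names come from this commit, and only the usage itself is illustrative:

// The flag can be set from the CLI (--no-byteswap, or the LLAMA_NO_BYTESWAP
// environment variable) or directly in code.
common_params params;
params.no_byteswap = true; // model file was byteswapped to big endian in advance

// common_model_params_to_llama() copies the flag into llama_model_params
// (see the common/common.cpp change below), and common_init_from_params()
// forwards it to the loader and to llama_lora_adapter_init().
llama_model_params mparams = common_model_params_to_llama(params);
// mparams.no_byteswap is now true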
13 changes: 8 additions & 5 deletions common/common.cpp
@@ -925,7 +925,7 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_lora_adapter_ptr lora;
lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
lora.reset(llama_lora_adapter_init(model, la.path.c_str(), mparams.no_byteswap));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
@@ -1030,6 +1030,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.no_byteswap = params.no_byteswap;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
@@ -1357,8 +1358,9 @@ struct llama_model * common_load_model_from_url(
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
/*.no_alloc = */ true,
/*.ctx = */ NULL,
/*.no_byteswap = */ false,
};
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
@@ -1856,8 +1858,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co

ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
/* .no_byteswap = */ false,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
1 change: 1 addition & 0 deletions common/common.h
@@ -297,6 +297,7 @@ struct common_params {
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
bool no_byteswap = false; // skip byteswapping on big endian systems

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
5 changes: 3 additions & 2 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(filename, params);
5 changes: 3 additions & 2 deletions examples/export-lora/export-lora.cpp
@@ -49,8 +49,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) {

static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
/*.no_byteswap = */ false,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
5 changes: 3 additions & 2 deletions examples/gguf-hash/gguf-hash.cpp
@@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_byteswap = */ false,
};

// xxh64 init
10 changes: 6 additions & 4 deletions examples/gguf-split/gguf-split.cpp
@@ -361,8 +361,9 @@ static void gguf_split(const split_params & split_params) {
struct ggml_context * ctx_meta = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_byteswap = */ false,
};

std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
@@ -426,8 +427,9 @@ static void gguf_merge(const split_params & split_params) {
struct ggml_context * ctx_meta = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
/*.no_byteswap = */ false,
};

if (i_split > 0) {
10 changes: 6 additions & 4 deletions examples/gguf/gguf.cpp
@@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) {
// just read tensor info
static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
/*.no_alloc = */ false,
/*.ctx = */ NULL,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
5 changes: 3 additions & 2 deletions examples/llava/clip.cpp
@@ -1114,8 +1114,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
struct ggml_context * meta = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &meta,
/*.no_alloc = */ true,
/*.ctx = */ &meta,
/*.no_byteswap = */ false,
};

struct gguf_context * ctx = gguf_init_from_file(fname, params);
2 changes: 2 additions & 0 deletions ggml/include/gguf.h
@@ -74,6 +74,8 @@ extern "C" {

// if not NULL, create a ggml_context and allocate the tensor data in it
struct ggml_context ** ctx;

bool no_byteswap;
};

GGML_API struct gguf_context * gguf_init_empty(void);
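As a usage illustration (not part of the diff), a sketch of initializing the extended gguf_init_params for a file that was already byteswapped to big-endian order in advance; the file name is a placeholder, all other names are from this commit:

#include "ggml.h"
#include "gguf.h"

struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
    /*.no_alloc    = */ false,
    /*.ctx         = */ &ctx_data,
    /*.no_byteswap = */ true,   // skip conversion: the data already matches the big endian host
};

struct gguf_context * ctx = gguf_init_from_file("model.be.gguf", params);
if (!ctx) {
    // handle load failure
}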
17 changes: 13 additions & 4 deletions ggml/src/gguf.cpp
@@ -218,13 +218,17 @@ struct gguf_context {

struct gguf_reader {
FILE * file;
bool no_byteswap = false;

gguf_reader(FILE * file) : file(file) {}
gguf_reader(FILE * file, bool v_no_byteswap) : file(file), no_byteswap(v_no_byteswap) {}

template <typename T>
bool read(T & dst) const {
auto res = fread(&dst, 1, sizeof(dst), file);
ggml_convert_from_le(&dst);
if (!no_byteswap) {
ggml_convert_from_le(&dst);
}
return res == sizeof(dst);
}

@@ -319,7 +323,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
}

struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
const struct gguf_reader gr(file);
const struct gguf_reader gr(file, params.no_byteswap);
struct gguf_context * ctx = new gguf_context;

bool ok = true;
@@ -1137,6 +1141,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo

struct gguf_writer {
std::vector<int8_t> & buf;
bool no_byteswap = false;

gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}

@@ -1146,7 +1151,11 @@ struct gguf_writer {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
buf.push_back(reinterpret_cast<const int8_t *>(&val)[sizeof(val) - i - 1]);
if (!no_byteswap) {
buf.push_back(reinterpret_cast<const int8_t *>(&val)[sizeof(val) - i - 1]);
} else {
buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
}
#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#error Unexpected or undefined __BYTE_ORDER__
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
@@ -1317,7 +1326,7 @@ struct gguf_writer {

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
auto byteswap = ggml_get_type_traits(info.t.type)->byteswap;
if (byteswap != nullptr) {
if (byteswap != nullptr && !no_byteswap) {
byteswap(buf.data() + offset, ggml_nelements(&(info.t)) / ggml_blck_size(info.t.type));
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
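To summarize the reader-side change above, a simplified sketch of the conditional conversion (not the actual gguf_reader, just the pattern; ggml_convert_from_le comes from the companion big-endian changes and is assumed to be a no-op on little-endian hosts):

#include <cstdio>

template <typename T>
static bool read_scalar(FILE * file, T & dst, bool no_byteswap) {
    const size_t n = fread(&dst, 1, sizeof(dst), file);
    if (!no_byteswap) {
        // GGUF stores scalars in little-endian order on disk; convert to host order
        ggml_convert_from_le(&dst);
    }
    return n == sizeof(dst);
}

On the writer side, the same flag suppresses the host-to-little-endian swap on big-endian builds, so a file that was pre-byteswapped to big endian round-trips unchanged.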
4 changes: 3 additions & 1 deletion include/llama.h
@@ -307,6 +307,7 @@ extern "C" {
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool no_byteswap; // don't do byteswap, load pre-byteswapped big endian model on big endian system
};

// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -518,7 +519,8 @@ extern "C" {
// TODO: rename to llama_adapter_lora_init
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
struct llama_model * model,
const char * path_lora);
const char * path_lora,
bool no_byteswap);

// Add a loaded LoRA adapter to given context
// This will not modify model's weight
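A sketch of a call site updated for the new llama_lora_adapter_init signature (the adapter path is a placeholder; passing false keeps the previous behaviour of byteswapping on big-endian hosts):

struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf", /*no_byteswap =*/ false);
if (adapter == NULL) {
    // adapter failed to load
}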
11 changes: 6 additions & 5 deletions src/llama-adapter.cpp
@@ -149,13 +149,14 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
delete adapter;
}

static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter, bool no_byteswap) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

ggml_context * ctx_init;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &ctx_init,
/* .no_alloc = */ true,
/* .ctx = */ &ctx_init,
/* .no_byteswap = */ no_byteswap,
};

gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
@@ -330,11 +331,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora, bool no_byteswap) {
struct llama_lora_adapter * adapter = new llama_lora_adapter();

try {
llama_lora_adapter_init_impl(*model, path_lora, *adapter);
llama_lora_adapter_init_impl(*model, path_lora, *adapter, no_byteswap);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
19 changes: 11 additions & 8 deletions src/llama-model-loader.cpp
@@ -366,7 +366,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);

llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p, bool no_byteswap) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
@@ -380,8 +380,9 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,

struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_byteswap = */ no_byteswap,
};

meta.reset(gguf_init_from_file(fname.c_str(), params));
@@ -433,8 +434,9 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split);

struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
/*.no_byteswap = */ no_byteswap,
};
gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) };
if (!ctx_gguf) {
@@ -582,8 +584,9 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
use_mmap = false;
}

this->use_mmap = use_mmap;
this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
this->no_byteswap = no_byteswap;
}

std::string llama_model_loader::get_arch_name() const {
@@ -928,7 +931,7 @@ bool llama_model_loader::load_all_data(

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
if (byteswap != nullptr) {
if (byteswap != nullptr && !no_byteswap) {
byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type));
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
@@ -964,7 +967,7 @@ bool llama_model_loader::load_all_data(

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
if (byteswap != nullptr) {
if (byteswap != nullptr && !no_byteswap) {
byteswap(read_buf.data(), read_buf.size() / ggml_blck_size(cur->type));
}
#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
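The tensor-data path above can be read as the following helper-style sketch (illustrative only; it assumes the byteswap member added to the type traits by the companion big-endian changes, and it compiles to a no-op on little-endian hosts):

static void maybe_byteswap_tensor_data(struct ggml_tensor * cur, void * data, bool no_byteswap) {
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
    if (byteswap != nullptr && !no_byteswap) {
        // convert block-by-block, mirroring the two call sites above
        byteswap(data, ggml_nelements(cur) / ggml_blck_size(cur->type));
    }
#else
    (void) cur; (void) data; (void) no_byteswap; // nothing to do on little-endian hosts
#endif
}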
3 changes: 2 additions & 1 deletion src/llama-model-loader.h
@@ -70,6 +70,7 @@ struct llama_model_loader {

bool use_mmap = false;
bool check_tensors;
bool no_byteswap = false;

llama_files files;
llama_ftype ftype;
@@ -90,7 +91,7 @@
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;

llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p, bool no_byteswap);

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
1 change: 1 addition & 0 deletions src/llama-model.cpp
@@ -1998,6 +1998,7 @@ struct llama_model_params llama_model_default_params() {
/*.use_mmap =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.no_byteswap =*/ false,
};

#ifdef GGML_USE_METAL
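At the C-API level the new default can be overridden as in this hedged sketch; the model path is a placeholder and the loading call is whichever model-loading entry point this tree exposes (llama_load_model_from_file is assumed here):

struct llama_model_params mparams = llama_model_default_params();
mparams.no_byteswap = true; // model was byteswapped to big endian in advance

// hypothetical load call; substitute the loader available in this tree
struct llama_model * model = llama_load_model_from_file("model.be.gguf", mparams);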
2 changes: 1 addition & 1 deletion src/llama-quant.cpp
@@ -525,7 +525,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides, /*no_byteswap*/ false);
ml.init_mappings(false); // no prefetching

llama_model model;
2 changes: 1 addition & 1 deletion src/llama.cpp
@@ -2452,7 +2452,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
model.t_start_us = ggml_time_us();

try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides, params.no_byteswap);

model.hparams.vocab_only = params.vocab_only;
