Merge commit '584d674be622fbf1578694ada6e62eebedbfd377' into concedo_experimental

# Conflicts:
#	.github/workflows/nix-flake-update.yml
#	Makefile
#	Package.swift
#	ggml-cuda.cu
#	tests/test-quantize-fns.cpp
LostRuins · Jan 14, 2024 · dc7bc0c · dc7bc0c
2 parents bd77a48 + 584d674
commit dc7bc0c
Show file tree

Hide file tree

Showing 31 changed files with 2,133 additions and 381 deletions.
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/imatrix
 /infill
 /libllama.so
 /llama-bench

diff --git a/common/common.cpp b/common/common.cpp
@@ -631,6 +631,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
  break;
  }
  params.ppl_stride = std::stoi(argv[i]);
+ } else if (arg == "-ptc" || arg == "--print-token-count") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_print = std::stoi(argv[i]);
  } else if (arg == "--ppl-output-type") {
  if (++i >= argc) {
  invalid_param = true;
@@ -813,7 +819,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf("\n");
  printf("options:\n");
  printf(" -h, --help show this help message and exit\n");
- printf(" --version show version and build info\n");
+ printf(" --version show version and build info\n");
  printf(" -i, --interactive run in interactive mode\n");
  printf(" --interactive-first run in interactive mode and wait for input right away\n");
  printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -910,7 +916,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" number of layers to store in VRAM\n");
  printf(" -ngld N, --n-gpu-layers-draft N\n");
  printf(" number of layers to store in VRAM for the draft model\n");
- printf(" -ts SPLIT --tensor-split SPLIT\n");
+ printf(" -ts SPLIT, --tensor-split SPLIT\n");
  printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
 #ifdef GGML_USE_CUBLAS
@@ -945,6 +951,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" --override-kv KEY=TYPE:VALUE\n");
  printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
  printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+ printf(" -ptc N, --print-token-count N\n");
+ printf(" print token count every N tokens (default: %d)\n", params.n_print);
  printf("\n");
 #ifndef LOG_DISABLE_LOGS
  log_print_usage();
@@ -1048,6 +1056,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
 }
 
 static ggml_type kv_cache_type_from_str(const std::string & s) {
+ if (s == "f32") {
+ return GGML_TYPE_F32;
+ }
  if (s == "f16") {
  return GGML_TYPE_F16;
  }

diff --git a/common/common.h b/common/common.h
@@ -58,6 +58,7 @@ struct gpt_params {
  int32_t n_beams = 0; // if non-zero then use beam search of given width.
  int32_t grp_attn_n = 1; // group-attention factor
  int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
  float rope_freq_base = 0.0f; // RoPE base frequency
  float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
  float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
@@ -254,4 +255,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -817,10 +817,17 @@ def set_gguf_parameters(self):
  hidden_size = self.hparams["hidden_size"]
 
  self.gguf_writer.add_name('persimmon-8b-chat')
+ self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
  self.gguf_writer.add_embedding_length(hidden_size)
  self.gguf_writer.add_block_count(block_count)
  self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+
+ # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
+ # than the head size?
+ # ref: https://github.com/ggerganov/llama.cpp/pull/4889
+ # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+ self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
+
  self.gguf_writer.add_head_count(head_count)
  self.gguf_writer.add_head_count_kv(head_count_kv)
  self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -36,6 +36,7 @@ else()
  add_subdirectory(lookahead)
  add_subdirectory(lookup)
  add_subdirectory(train-text-from-scratch)
+ add_subdirectory(imatrix)
  if (LLAMA_METAL)
  add_subdirectory(metal)
  endif()

diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
@@ -245,9 +245,8 @@ static struct lora_data * load_lora(struct lora_info * info) {
  params_ggml.no_alloc = true;
  result->ctx = ggml_init(params_ggml);
 
- uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
  uint32_t magic = file.read_u32();
- if (magic != LLAMA_FILE_MAGIC_LORA) {
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
  die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
  }
  uint32_t version = file.read_u32();

diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET imatrix)
+add_executable(${TARGET} imatrix.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)