sync : ggml (ggml_scale, ggml_row_size, etc.) (ggerganov#1677)

* sync : ggml
* sync : llama.cpp
* talk-llama : fix obsolete param
* ggml-alloc : fix ggml_tallocr_is_own
* talk.wasm : update to new ggml
* ggml : fix type punning in ggml_scale
* ggml : cuda jetson + arm quants warnings
iThalay · Sep 23, 2024 · f0fe044 · f0fe044
1 parent 9b1a9f1
commit f0fe044
Show file tree

Hide file tree

Showing 18 changed files with 3,520 additions and 1,578 deletions.
diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp
diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h
@@ -39,10 +39,11 @@
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
 
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -126,7 +127,7 @@ extern "C" {
  bool sorted;
  } llama_token_data_array;
 
- typedef void (*llama_progress_callback)(float progress, void *ctx);
+ typedef bool (*llama_progress_callback)(float progress, void *ctx);
 
  // Input data for llama_decode
  // A llama_batch object can contain input about one or many sequences
@@ -158,16 +159,38 @@ extern "C" {
  llama_seq_id all_seq_id; // used if seq_id == NULL
  } llama_batch;
 
+ enum llama_model_kv_override_type { // type of the tag field in struct llama_model_kv_override
+ LLAMA_KV_OVERRIDE_INT, // presumably pairs with int_value (int64_t) - TODO confirm in llama.cpp
+ LLAMA_KV_OVERRIDE_FLOAT, // presumably pairs with float_value (double) - TODO confirm in llama.cpp
+ LLAMA_KV_OVERRIDE_BOOL, // presumably pairs with bool_value (bool) - TODO confirm in llama.cpp
+ };
+
+ struct llama_model_kv_override { // one metadata override entry; llama_model_params.kv_overrides points at these
+ char key[128]; // key of the metadata pair to override, stored in a fixed 128-byte buffer
+ enum llama_model_kv_override_type tag; // kind of value carried by this entry
+ union { // anonymous union: the three value members share the same storage
+ int64_t int_value;
+ double float_value;
+ bool bool_value;
+ };
+ };
+
  struct llama_model_params {
  int32_t n_gpu_layers; // number of layers to store in VRAM
  int32_t main_gpu; // the GPU that is used for scratch and small tensors
  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
- // called with a progress value between 0 and 1, pass NULL to disable
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
  llama_progress_callback progress_callback;
+
  // context pointer passed to the progress callback
  void * progress_callback_user_data;
 
+ // override key-value pairs of the model meta data
+ const struct llama_model_kv_override * kv_overrides;
+
  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible
@@ -185,17 +208,20 @@ extern "C" {
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
- float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
+ float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
  float yarn_attn_factor; // YaRN magnitude scaling factor
  float yarn_beta_fast; // YaRN low correction dim
  float yarn_beta_slow; // YaRN high correction dim
  uint32_t yarn_orig_ctx; // YaRN original context size
 
+ enum ggml_type type_k; // data type for K cache
+ enum ggml_type type_v; // data type for V cache
+
  // Keep the booleans together to avoid misalignment during copy-by-value.
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
- bool f16_kv;  // use fp16 for KV cache, fp32 otherwise
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
- bool embedding; // embedding mode only
+ bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+ bool embedding; // embedding mode only
+ bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
  };
 
  // model quantization parameters
@@ -290,7 +316,9 @@ extern "C" {
 
  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
- LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ // TODO: become more consistent with returned int types across the API
+ LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
+ LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
@@ -301,6 +329,23 @@ extern "C" {
  // Get the model's RoPE frequency scaling factor
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+ // Functions to access the model's GGUF metadata scalar values
+ // - The functions return the length of the string on success, or -1 on failure
+ // - The output string is always null-terminated and cleared on failure
+ // - GGUF array values are not supported by these functions
+
+ // Get metadata value as a string by key name
+ LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+ // Get the number of metadata key/value pairs
+ LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+ // Get metadata key name by index
+ LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+ // Get metadata value as a string by index
+ LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
  // Get a string describing the model type
  LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
@@ -344,9 +389,60 @@ extern "C" {
  // KV cache
  //
 
- // Returns the number of tokens in the KV cache
- LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
- "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+ // Information associated with an individual cell in the KV cache view.
+ struct llama_kv_cache_view_cell { // element type of llama_kv_cache_view.cells
+ // The position for this cell. Takes KV cache shifts into account.
+ // May be negative if the cell is not populated.
+ llama_pos pos;
+ };
+
+ // An updateable view of the KV cache.
+ struct llama_kv_cache_view { // lifecycle: llama_kv_cache_view_init -> llama_kv_cache_view_update -> llama_kv_cache_view_free
+ // Number of KV cache cells. This will be the same as the context size.
+ int32_t n_cells;
+
+ // Maximum number of sequences that can exist in a cell. It's not an error
+ // if there are more sequences in a cell than this value, however they will
+ // not be visible in the view cells_sequences.
+ int32_t n_max_seq;
+
+ // Number of tokens in the cache. For example, if there are two populated
+ // cells, the first with 1 sequence id in it and the second with 2 sequence
+ // ids then you'll have 3 tokens.
+ int32_t token_count;
+
+ // Number of populated cache cells.
+ int32_t used_cells;
+
+ // Maximum contiguous empty slots in the cache.
+ int32_t max_contiguous;
+
+ // Index to the start of the max_contiguous slot range. Can be negative
+ // when cache is full.
+ int32_t max_contiguous_idx;
+
+ // Information for an individual cell.
+ struct llama_kv_cache_view_cell * cells; // presumably n_cells entries - TODO confirm against llama_kv_cache_view_init
+
+ // The sequences for each cell. There will be n_max_seq items per cell.
+ llama_seq_id * cells_sequences; // presumably n_cells * n_max_seq entries in total - TODO confirm
+ };
+
+ // Create an empty KV cache view. (use only for debugging purposes)
+ LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+ // Free a KV cache view. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+ // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+ // Returns the number of tokens in the KV cache (slow, use only for debug)
+ // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+ // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+ LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
  // Clear the KV cache
  LLAMA_API void llama_kv_cache_clear(
@@ -517,6 +613,12 @@ extern "C" {
  LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
  // codellama infill tokens
  LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
  LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle

diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp
@@ -282,7 +282,6 @@ int main(int argc, char ** argv) {
  // tune these to your liking
  lcparams.n_ctx = 2048;
  lcparams.seed = 1;
- lcparams.f16_kv = true;
  lcparams.n_threads = params.n_threads;
 
  struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);

diff --git a/examples/talk.wasm/gpt-2.cpp b/examples/talk.wasm/gpt-2.cpp
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  const int n_ctx = hparams.n_ctx;
  const int n_vocab = hparams.n_vocab;
 
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
 
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
+ ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
 
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
 
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
 
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(wtype,  3*n_embd*n_embd)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));  // c_attn_attn_b
 
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));   // c_attn_proj_w
- ctx_size += n_layer*(   n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
 
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype,  4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));  // c_mlp_fc_b
 
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype,  4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));   // c_mlp_proj_b
 
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
 
  ctx_size += (6 + 12*n_layer)*256; // object overhead
 
@@ -524,8 +524,7 @@ bool gpt2_eval(
  struct ggml_tensor * KQ_scaled =
  ggml_scale(ctx0,
  KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ 1.0f/sqrt(float(n_embd)/n_head));
 
  // KQ_masked = mask_past(KQ_scaled)
  // [n_past + N, N, 12]

diff --git a/examples/talk/gpt-2.cpp b/examples/talk/gpt-2.cpp
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  const int n_ctx = hparams.n_ctx;
  const int n_vocab = hparams.n_vocab;
 
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
 
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
+ ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
 
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
 
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
 
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(wtype,  3*n_embd*n_embd)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));  // c_attn_attn_b
 
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));   // c_attn_proj_w
- ctx_size += n_layer*(   n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
 
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(wtype,  4*n_embd*n_embd)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));  // c_mlp_fc_b
 
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(wtype,  4*n_embd*n_embd)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));   // c_mlp_proj_b
 
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
 
  ctx_size += (6 + 12*n_layer)*256; // object overhead
 
@@ -525,8 +525,7 @@ bool gpt2_eval(
  struct ggml_tensor * KQ_scaled =
  ggml_scale(ctx0,
  KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
- );
+ 1.0f/sqrt(float(n_embd)/n_head));
 
  // KQ_masked = mask_past(KQ_scaled)
  // [n_past + N, N, 12]

diff --git a/extra/sync-llama.sh b/extra/sync-llama.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+cp -rpv ../llama.cpp/llama.h ./examples/talk-llama/llama.h # paths are relative to the current directory; presumably run from the repo root with llama.cpp checked out alongside - TODO confirm
+cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
diff --git a/ggml-alloc.c b/ggml-alloc.c
@@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t
 
 // check if a tensor is allocated by this buffer
 static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
- return tensor->buffer == alloc->buffer;
+ return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); // a view is owned only when view_src->buffer == alloc->buffer as well
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;
 
- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
  assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
 
  if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }
 
 void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  ggml_gallocr_free(alloc->galloc);
  ggml_tallocr_free(alloc->talloc);
  free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }
 
  if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
  return NULL;
  }
 
@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  } else {
  ggml_backend_view_init(buffer, t);
  }
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
  }
  }