Skip to content

Commit

Permalink
update to latest llama.cpp breaking API changes
Browse files Browse the repository at this point in the history
Signed-off-by: mudler <mudler@localai.io>
  • Loading branch information
mudler committed Oct 7, 2023
1 parent 79f9587 commit 231a797
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 101 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/test-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,12 @@ jobs:
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y pip wget
- name: Build and test
run: |
set -o pipefail
GPU_TESTS=true BUILD_TYPE=cublas CMAKE_ARGS="-DLLAMA_METAL=OFF -DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" \
make test 2>&1 | tee test_log.log
set +o pipefail
if grep -q "using CUDA for GPU acceleration" test_log.log; then
echo "All good";
echo "GPU was used";
else
echo "No CUDA found";
exit 1;
Expand Down
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,8 @@ binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.c

## https://github.com/ggerganov/llama.cpp/pull/1902
prepare:
cd llama.cpp && patch -p1 < ../patches/1902-cuda.patch
cd llama.cpp && \
patch -p1 < ../patches/1902-cuda.patch
touch $@

libbinding.a: prepare binding.o llama.cpp/k_quants.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o $(EXTRA_TARGETS)
Expand All @@ -248,4 +249,4 @@ ggllm-test-model.bin:
wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O ggllm-test-model.bin

test: ggllm-test-model.bin libbinding.a
C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=ggllm-test-model.bin go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="$(TEST_LABEL)" --flake-attempts 5 -v -r ./...
C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=$(abspath ./)/ggllm-test-model.bin go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="$(TEST_LABEL)" -v -r ./...
130 changes: 69 additions & 61 deletions binding.cpp

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions binding.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,21 @@ void* load_model(const char *fname,
bool mlock,
bool embeddings,
bool mmap,
bool low_vram,
int n_gpu,
int n_batch,
const char *maingpu,
const char *tensorsplit,
bool numa,
float rope_freq_base,
float rope_freq_scale,
bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity
bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool perplexity
);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tokenSize, float * res_embeddings);

void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
void* llama_allocate_params(const char *prompt, int seed, int threads, int batch_threads, int tokens,
int top_k, float top_p, float temp, float repeat_penalty,
int repeat_last_n, bool ignore_eos, bool memory_f16,
int n_batch, int n_keep, const char** antiprompt, int antiprompt_count,
Expand Down
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 92 files
+3 −0 .dockerignore
+63 −40 .github/workflows/build.yml
+8 −1 .gitignore
+64 −43 CMakeLists.txt
+79 −25 Makefile
+6 −3 Package.swift
+21 −15 README.md
+2 −0 common/CMakeLists.txt
+150 −64 common/common.cpp
+21 −7 common/common.h
+37 −37 common/log.h
+1,496 −0 common/train.cpp
+230 −0 common/train.h
+8 −2 convert-baichuan-hf-to-gguf.py
+74 −105 convert-falcon-hf-to-gguf.py
+12 −51 convert-gptneox-hf-to-gguf.py
+130 −0 convert-persimmon-to-gguf.py
+318 −0 convert-refact-hf-to-gguf.py
+8 −54 convert-starcoder-hf-to-gguf.py
+8 −23 convert.py
+1 −1 docs/BLIS.md
+4 −0 examples/CMakeLists.txt
+79 −147 examples/baby-llama/baby-llama.cpp
+5 −0 examples/batched/CMakeLists.txt
+44 −0 examples/batched/README.md
+255 −0 examples/batched/batched.cpp
+4 −3 examples/beam-search/beam-search.cpp
+4 −4 examples/chat-persistent.sh
+10 −10 examples/embd-input/embd-input-lib.cpp
+1 −1 examples/embd-input/embd-input-test.cpp
+11 −10 examples/embedding/embedding.cpp
+5 −0 examples/export-lora/CMakeLists.txt
+26 −0 examples/export-lora/README.md
+474 −0 examples/export-lora/export-lora.cpp
+5 −0 examples/finetune/CMakeLists.txt
+90 −0 examples/finetune/README.md
+489 −0 examples/finetune/convert-finetune-checkpoint-to-gguf.py
+1,940 −0 examples/finetune/finetune.cpp
+8 −0 examples/infill/CMakeLists.txt
+41 −0 examples/infill/README.md
+769 −0 examples/infill/infill.cpp
+1 −1 examples/jeopardy/README.md
+115 −50 examples/llama-bench/llama-bench.cpp
+10 −0 examples/main-cmake-pkg/CMakeLists.txt
+2 −2 examples/main/README.md
+43 −44 examples/main/main.cpp
+8 −0 examples/parallel/CMakeLists.txt
+3 −0 examples/parallel/README.md
+426 −0 examples/parallel/parallel.cpp
+70 −46 examples/perplexity/perplexity.cpp
+9 −8 examples/quantize-stats/quantize-stats.cpp
+1 −0 examples/quantize/quantize.cpp
+12 −18 examples/save-load-state/save-load-state.cpp
+16 −4 examples/server/README.md
+270 −119 examples/server/server.cpp
+21 −0 examples/simple/README.md
+109 −41 examples/simple/simple.cpp
+13 −10 examples/speculative/speculative.cpp
+8 −3 examples/train-text-from-scratch/README.md
+9 −5 examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+539 −1,500 examples/train-text-from-scratch/train-text-from-scratch.cpp
+1 −1 flake.nix
+8 −2 ggml-alloc.c
+1 −0 ggml-alloc.h
+326 −149 ggml-cuda.cu
+1 −0 ggml-cuda.h
+147 −47 ggml-metal.m
+193 −40 ggml-metal.metal
+119 −53 ggml-opencl.cpp
+2,355 −1,166 ggml.c
+122 −35 ggml.h
+293 −149 gguf-py/gguf/gguf.py
+1 −1 gguf-py/pyproject.toml
+744 −2 k_quants.c
+2,359 −744 llama.cpp
+320 −106 llama.h
+ models/ggml-vocab-aquila.gguf
+ models/ggml-vocab-falcon.gguf
+4 −4 pocs/vdot/q8dot.cpp
+49 −0 prompts/LLM-questions.txt
+43 −0 prompts/parallel-questions.txt
+2 −0 scripts/LlamaConfig.cmake.in
+6 −5 tests/CMakeLists.txt
+134 −71 tests/test-grad0.cpp
+2 −31 tests/test-opt.cpp
+14 −15 tests/test-quantize-perf.cpp
+221 −0 tests/test-rope.cpp
+15 −6 tests/test-tokenizer-0-falcon.cpp
+7 −5 tests/test-tokenizer-0-llama.cpp
+113 −0 tests/test-tokenizer-1-bpe.cpp
+11 −32 tests/test-tokenizer-1-llama.cpp
+462 −0 unicode.h
16 changes: 8 additions & 8 deletions llama.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ func New(model string, opts ...ModelOption) (*LLama, error) {

result := C.load_model(modelPath,
C.int(mo.ContextSize), C.int(mo.Seed),
C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap),
C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
C.bool(MulMatQ), loraAdapter, loraBase, C.bool(mo.Perplexity),
C.bool(MulMatQ), loraAdapter, loraBase, C.float(mo.LoraScale), C.bool(mo.Perplexity),
)

if result == nil {
Expand Down Expand Up @@ -112,7 +112,7 @@ func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32,
// float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const char *maingpu, const char *tensorsplit , bool prompt_cache_ro,
// float rope_freq_base, float rope_freq_scale, float negative_prompt_scale, const char* negative_prompt
// );
params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), nil, C.int(0),
Expand Down Expand Up @@ -154,7 +154,7 @@ func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
Expand Down Expand Up @@ -193,7 +193,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
Expand Down Expand Up @@ -238,7 +238,7 @@ func (l *LLama) SpeculativeSampling(ll *LLama, text string, opts ...PredictOptio
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
Expand Down Expand Up @@ -296,7 +296,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
Expand Down Expand Up @@ -346,7 +346,7 @@ func (l *LLama) TokenizeString(text string, opts ...PredictOption) (int32, []int
var fakeDblPtr **C.char

// Copy-pasted with minimal modifications. Should this be simplified, or do we need an "allocate defaults" helper?
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), fakeDblPtr, C.int(0),
Expand Down
7 changes: 5 additions & 2 deletions llama_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ how much is 2+2?
Expect(err).ToNot(HaveOccurred())
Expect(model).ToNot(BeNil())
text, err := model.SpeculativeSampling(model2, `[INST] Answer to the following question:
how much is 2+2?
Do a simple math calculation: How much is 2+2?
[/INST]`, llama.SetNDraft(16),
)
Expect(err).ToNot(HaveOccurred(), text)
Expand All @@ -97,7 +97,10 @@ how much is 2+2?
getModel := func() (*LLama, error) {
model, err := New(
testModelPath,
llama.EnableF16Memory, llama.SetContext(128), llama.EnableEmbeddings, llama.SetGPULayers(10),
llama.EnableF16Memory,
llama.SetContext(128),
llama.EnableEmbeddings,
llama.SetGPULayers(10),
)
Expect(err).ToNot(HaveOccurred())
Expect(model).ToNot(BeNil())
Expand Down
21 changes: 15 additions & 6 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ type ModelOptions struct {
F16Memory bool
MLock bool
MMap bool
LowVRAM bool
Embeddings bool
NUMA bool
NGPULayers int
Expand All @@ -16,6 +15,7 @@ type ModelOptions struct {
FreqRopeBase float32
FreqRopeScale float32
MulMatQ *bool
LoraScale float32
LoraBase string
LoraAdapter string
Perplexity bool
Expand All @@ -29,6 +29,7 @@ type PredictOptions struct {
DebugMode bool
StopPrompts []string
IgnoreEOS bool
BatchThreads int

TailFreeSamplingZ float32
TypicalP float32
Expand Down Expand Up @@ -68,7 +69,6 @@ var DefaultModelOptions ModelOptions = ModelOptions{
MLock: false,
Embeddings: false,
MMap: true,
LowVRAM: false,
NBatch: 512,
FreqRopeBase: 10000,
FreqRopeScale: 1.0,
Expand All @@ -79,6 +79,7 @@ var DefaultOptions PredictOptions = PredictOptions{
Threads: 4,
Tokens: 128,
Penalty: 1.1,
BatchThreads: -1,
Repeat: 64,
Batch: 512,
NKeep: 64,
Expand Down Expand Up @@ -109,6 +110,18 @@ func SetLoraBase(s string) ModelOption {
}
}

// SetBatchThreads returns a PredictOption that sets PredictOptions.BatchThreads to b.
// The field is handed to the C binding as the batch_threads argument of
// llama_allocate_params; DefaultOptions initializes it to -1.
// NOTE(review): presumably -1 means "fall back to the Threads value" — confirm in binding.cpp.
func SetBatchThreads(b int) PredictOption {
return func(p *PredictOptions) {
p.BatchThreads = b
}
}

// SetLoraScale returns a ModelOption that sets ModelOptions.LoraScale to f.
// New passes the field to C.load_model as the lora_scale argument.
// NOTE(review): the bundled 1902-cuda.patch appears to replace a scale of 0
// with 1.0 only when a LoRA base is given — confirm this is the intended default.
func SetLoraScale(f float32) ModelOption {
return func(p *ModelOptions) {
p.LoraScale = f
}
}

func SetLoraAdapter(s string) ModelOption {
return func(p *ModelOptions) {
p.LoraAdapter = s
Expand Down Expand Up @@ -219,10 +232,6 @@ func SetNegativePrompt(np string) PredictOption {
}
}

var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
p.LowVRAM = true
}

// EnableNUMA is a ModelOption that sets ModelOptions.NUMA to true.
// New forwards the field to C.load_model as the numa argument.
var EnableNUMA ModelOption = func(p *ModelOptions) {
p.NUMA = true
}
Expand Down
38 changes: 21 additions & 17 deletions patches/1902-cuda.patch
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
diff --git a/common/common.cpp b/common/common.cpp
index 2597ba0..e42ae73 100644
index ec181c6..9ba699b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1268,3 +1268,218 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
@@ -1345,3 +1345,222 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}
+
+gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base) {
+gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base, float lora_scale) {
+ gpt_params* lparams = new gpt_params;
+ fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
+
+ // Initialize the 'model' member with the 'fname' parameter
+ lparams->model = fname;
+ lparams->lora_base = lora_base;
+ lparams->lora_adapter = lora;
+ if (lora_scale == 0 && !lora_base.empty()) {
+ lora_scale = 1.0f;
+ }
+ if (!lora.empty()) {
+ lparams->lora_adapter.push_back(std::make_tuple(lora, lora_scale));
+ }
+ if (lparams->lora_adapter.empty()) {
+ lparams->use_mmap = false;
+ }
Expand All @@ -30,14 +35,14 @@ index 2597ba0..e42ae73 100644
+ return lparams;
+}
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity) {
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all) {
+ // load the model
+ gpt_params * lparams;
+// Temporary workaround for https://github.com/go-skynet/go-llama.cpp/issues/218
+#ifdef GGML_USE_CUBLAS
+ lparams = create_gpt_params_cuda(fname);
+#else
+ lparams = create_gpt_params(fname, lora, lora_base);
+ lparams = create_gpt_params(fname, lora, lora_base, lora_scale);
+#endif
+ llama_model * model;
+ llama_binding_state * state;
Expand All @@ -49,10 +54,8 @@ index 2597ba0..e42ae73 100644
+ lparams->embedding = embeddings;
+ lparams->use_mlock = mlock;
+ lparams->n_gpu_layers = n_gpu_layers;
+ lparams->perplexity = perplexity;
+ lparams->logits_all = logits_all;
+ lparams->use_mmap = mmap;
+
+ lparams->low_vram = low_vram;
+ if (rope_freq_base != 0.0f) {
+ lparams->rope_freq_base = rope_freq_base;
+ } else {
Expand Down Expand Up @@ -114,8 +117,9 @@ index 2597ba0..e42ae73 100644
+ int idx) {
+
+ struct gpt_params params = *g_params;
+
+ const int n_ctx = llama_n_ctx(ctx);
+ const int n_vocab = llama_n_vocab(ctx);
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ const float temp = params.temp;
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
Expand All @@ -133,7 +137,7 @@ index 2597ba0..e42ae73 100644
+
+ llama_token id = 0;
+
+ float * logits = llama_get_logits(ctx) + idx * n_vocab;
+ float * logits = llama_get_logits_ith(ctx, idx);
+
+ // Apply params.logit_bias map
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
Expand Down Expand Up @@ -184,19 +188,19 @@ index 2597ba0..e42ae73 100644
+ if (mirostat == 1) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ const int mirostat_m = 100;
+ llama_sample_temperature(ctx, &cur_p, temp);
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+ } else if (mirostat == 2) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ llama_sample_temperature(ctx, &cur_p, temp);
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+ } else {
+ // Temperature sampling
+ llama_sample_top_k (ctx, &cur_p, top_k, 1);
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
+ llama_sample_typical (ctx, &cur_p, typical_p, 1);
+ llama_sample_top_p (ctx, &cur_p, top_p, 1);
+ llama_sample_temperature(ctx, &cur_p, temp);
+ llama_sample_temp(ctx, &cur_p, temp);
+
+ {
+ const int n_top = 10;
Expand All @@ -223,10 +227,10 @@ index 2597ba0..e42ae73 100644
+}
\ No newline at end of file
diff --git a/common/common.h b/common/common.h
index 18aea38..ca7a168 100644
index 0e2d3fa..9992d2b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -209,3 +209,19 @@ std::string get_sortable_timestamp();
@@ -221,3 +221,19 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
Expand All @@ -236,7 +240,7 @@ index 18aea38..ca7a168 100644
+ llama_model * model;
+};
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity);
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all);
+
+llama_token llama_sample_token_binding(
+ struct llama_context * ctx,
Expand Down

0 comments on commit 231a797

Please sign in to comment.