Commit d0d276f

Merge branch 'master' into Nexes_CQ30
2 parents 7cecefd + 418f5ee commit d0d276f

46 files changed, +3058 -2521 lines changed

README.md

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
+- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----

common/arg.cpp

Lines changed: 1 addition & 8 deletions
@@ -943,13 +943,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sparams.min_p = std::stof(value);
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"--tfs"}, "N",
-        string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
-        [](common_params & params, const std::string & value) {
-            params.sparams.tfs_z = std::stof(value);
-        }
-    ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
@@ -1074,7 +1067,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
-        string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
                       "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
        [](common_params & params, int value) {
            params.sparams.mirostat = value;

common/common.cpp

Lines changed: 0 additions & 1 deletion
@@ -2090,7 +2090,6 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
     fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);

common/common.h

Lines changed: 1 addition & 3 deletions
@@ -88,7 +88,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TOP_K       = 2,
     COMMON_SAMPLER_TYPE_TOP_P       = 3,
     COMMON_SAMPLER_TYPE_MIN_P       = 4,
-    COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+    //COMMON_SAMPLER_TYPE_TFS_Z     = 5,
     COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
@@ -113,7 +113,6 @@ struct common_sampler_params {
     float min_p           = 0.05f; // 0.0 = disabled
     float xtc_probability = 0.00f; // 0.0 = disabled
     float xtc_threshold   = 0.10f; // > 0.5 disables XTC
-    float tfs_z           = 1.00f; // 1.0 = disabled
     float typ_p           = 1.00f; // typical_p, 1.0 = disabled
     float temp            = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float dynatemp_range  = 0.00f; // 0.0 = disabled
@@ -139,7 +138,6 @@ struct common_sampler_params {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
-        COMMON_SAMPLER_TYPE_TFS_Z,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
         COMMON_SAMPLER_TYPE_MIN_P,

common/sampling.cpp

Lines changed: 2 additions & 11 deletions
@@ -131,11 +131,11 @@ std::string common_sampler_params::print() const {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -199,9 +199,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             case COMMON_SAMPLER_TYPE_XTC:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                 break;
-            case COMMON_SAMPLER_TYPE_TFS_Z:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                break;
             case COMMON_SAMPLER_TYPE_TYPICAL_P:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
                 break;
@@ -373,7 +370,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
     switch (cnstr) {
         case COMMON_SAMPLER_TYPE_DRY: return 'd';
         case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
-        case COMMON_SAMPLER_TYPE_TFS_Z: return 'f';
         case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
         case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
@@ -388,7 +384,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
     switch (cnstr) {
         case COMMON_SAMPLER_TYPE_DRY: return "dry";
         case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
-        case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z";
         case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
        case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
        case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
@@ -406,7 +401,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
         { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
-        { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc", COMMON_SAMPLER_TYPE_XTC },
         { "infill", COMMON_SAMPLER_TYPE_INFILL },
@@ -423,8 +417,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
-        { "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z },
-        { "tfs", COMMON_SAMPLER_TYPE_TFS_Z },
         { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
     };
 
@@ -452,7 +444,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
     std::unordered_map<char, common_sampler_type> sampler_name_map = {
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
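
With tail-free sampling gone from the chain, building an equivalent chain by hand through the llama.h sampler API (the same calls the code above uses) looks roughly like the sketch below. This is illustrative only: the parameter values mirror the defaults visible in common/common.h and common/common.cpp, and the helper name make_default_chain is made up.

// Sketch only, not part of this commit: a sampler chain without the removed
// tail-free stage. Assumes the llama.h sampler API at this revision.
#include "llama.h"

static struct llama_sampler * make_default_chain() {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));              // default top_k
    llama_sampler_chain_add(chain, llama_sampler_init_typical(1.00f, 1));      // typ_p, 1.0 = disabled
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.95f, 1));        // default top_p
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));        // default min_p
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.80f));            // default temp
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // final token pick

    return chain;
}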

convert_lora_to_gguf.py

Lines changed: 3 additions & 3 deletions
@@ -230,7 +230,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -257,11 +257,11 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--base", type=Path, required=True,
-        help="directory containing base model file",
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
     )
     parser.add_argument(
         "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
     )
 
     return parser.parse_args()

examples/llama-bench/llama-bench.cpp

Lines changed: 22 additions & 110 deletions
@@ -21,12 +21,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
 }
 
 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
+    std::vector<std::string> cpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            cpu_list.push_back(ggml_backend_dev_description(dev));
         }
-        fclose(f);
-    }
-#elif defined(_WIN32)
-    HKEY hKey;
-    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                    TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                    0,
-                    KEY_READ,
-                    &hKey) != ERROR_SUCCESS) {
-        // fail to open registry key
-        return "";
     }
-    char cpu_brand[256];
-    DWORD cpu_brand_size = sizeof(cpu_brand);
-    if (RegQueryValueExA(hKey,
-                        TEXT("ProcessorNameString"),
-                        NULL,
-                        NULL,
-                        (LPBYTE)cpu_brand,
-                        &cpu_brand_size) == ERROR_SUCCESS) {
-        id.assign(cpu_brand, cpu_brand_size);
-        if (id.find('\0') != std::string::npos) {
-            id.resize(id.find('\0'));
-        }
-    }
-    RegCloseKey(hKey);
-#endif
-    // TODO: other platforms
-    return id;
+    return join(cpu_list, ", ");
 }
 
 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUDA
-    int count = ggml_backend_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-#endif
-#ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_CANN
-    uint32_t count = ggml_backend_cann_get_device_count();
-    for (uint32_t i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-    // TODO: other backends
-    return id;
+    return join(gpu_list, ", ");
 }
 
 // command line params
@@ -938,29 +864,15 @@ struct test {
     }
 
     static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
-        }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto * reg = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
         }
-
-        return "CPU";
+        return backends.empty() ? "CPU" : join(backends, ",");
     }
 
     static const std::vector<std::string> & get_fields() {
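
The replacement code above queries the ggml backend registry instead of backend-specific headers, so the per-backend #ifdef blocks disappear. For reference, the same enumeration can be exercised standalone; a hedged sketch, assuming the ggml_backend_dev_* declarations come from "ggml-backend.h" and that the desired backends are linked in (they register themselves in that case).

// Sketch only: list every registered device the way the new
// get_cpu_info()/get_gpu_info() do, instead of querying CUDA/SYCL/CANN directly.
#include <cstdio>
#include "ggml-backend.h" // assumed location of the ggml_backend_dev_* declarations

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        auto * dev = ggml_backend_dev_get(i);
        const char * kind = "OTHER";
        switch (ggml_backend_dev_type(dev)) {
            case GGML_BACKEND_DEVICE_TYPE_CPU:   kind = "CPU";   break;
            case GGML_BACKEND_DEVICE_TYPE_ACCEL: kind = "ACCEL"; break;
            case GGML_BACKEND_DEVICE_TYPE_GPU:   kind = "GPU";   break;
            default:                             break;
        }
        printf("%-5s %s\n", kind, ggml_backend_dev_description(dev));
    }
    return 0;
}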

examples/main/README.md

Lines changed: 9 additions & 10 deletions
@@ -235,14 +235,6 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e
 
 Example usage: `--min-p 0.05`
 
-### Tail-Free Sampling (TFS)
-
-- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
-
-Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
-
-Example usage: `--tfs 0.95`
-
 ### Locally Typical Sampling
 
 - `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
@@ -341,6 +333,15 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
 
+## LoRA (Low-Rank Adaptation) adapters
+
+- `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters.
+- `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can be repeated to use multiple adapters.
+
+You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`.
+
+LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert_lora_to_gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed.
+
 ## Additional Options
 
 These options provide extra functionality and customization when running the LLaMA models:
@@ -349,6 +350,4 @@ These options provide extra functionality and customization when running the LLa
 - `--verbose-prompt`: Print the prompt before generating text.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
-- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 - `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
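
The run-time, non-merging behaviour described in the new README text corresponds roughly to loading the GGUF adapter once per model and attaching it to a context with a scale. A hedged sketch follows, assuming the llama_lora_adapter_* entry points present in llama.h around this revision; the file names and the 0.5 scale are placeholders.

// Rough sketch, not from this commit: attach a GGUF LoRA adapter at inference
// time (no merge into the base weights). Assumes the llama_lora_adapter_* API.
#include <cstdio>
#include "llama.h"

int main() {
    llama_model * model = llama_load_model_from_file("base-model.gguf", llama_model_default_params()); // placeholder path
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());

    // One adapter object per file; attach it to the context with a scale.
    llama_lora_adapter * adapter = llama_lora_adapter_init(model, "my_adapter_1.gguf"); // placeholder path
    if (adapter != nullptr) {
        llama_lora_adapter_set(ctx, adapter, 0.5f); // comparable to --lora-scaled my_adapter_1.gguf 0.5
    }

    // ... tokenize, decode and sample as usual; the adapter modifies the forward pass ...

    llama_free(ctx);
    llama_free_model(model); // adapter lifetime is tied to the model in this API
    return 0;
}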
