Commit f95ed01

Merge branch 'master' into Nexes_CQ_10
2 parents: accd71d + edc2656

File tree

5 files changed: 108 additions, 242 deletions

common/arg.cpp

Lines changed: 3 additions & 20 deletions
```diff
@@ -1163,14 +1163,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
-        string_format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
+        string_format("group-attention width (default: %d)", params.grp_attn_w),
         [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1788,23 +1788,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(common_arg(
-        {"-spf", "--system-prompt-file"}, "FNAME",
-        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
-        [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::string system_prompt;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-            );
-            params.system_prompt = system_prompt;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
```

common/common.h

Lines changed: 0 additions & 1 deletion
```diff
@@ -282,7 +282,6 @@ struct common_params {
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
```

examples/server/README.md

Lines changed: 3 additions & 7 deletions
````diff
@@ -60,8 +60,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
 | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
 | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
-| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
-| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
 | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
@@ -149,7 +147,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
@@ -320,7 +317,6 @@ node index.js
 
 - The prompt is a string or an array with the first element given as a string
 - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
-- The system prompt is empty
 
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
@@ -378,6 +374,8 @@ node index.js
 
 `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
 
+`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
+
 `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
 `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`
@@ -536,14 +534,12 @@ This endpoint is public (no API key check). By default, it is read-only. To make
 
 ```json
 {
-    "system_prompt": "",
     "default_generation_settings": { ... },
     "total_slots": 1,
     "chat_template": ""
 }
 ```
 
-- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template
@@ -554,7 +550,7 @@ To use this endpoint with POST method, you need to start server with `--props`
 
 *Options:*
 
-- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
+- None yet
 
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
 
````
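The newly documented `t_max_predict_ms` field can be tried against the server's `/completion` endpoint. A hedged sketch — the host, port, prompt, and `n_predict` value are illustrative assumptions:

```sh
# Cap the text-generation phase at 500 ms; per the README hunk above, the
# timeout only fires once a new-line character has already been generated.
curl http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "def fibonacci(n):", "n_predict": 128, "t_max_predict_ms": 500}'
```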
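Likewise, after the `system_prompt` removal, `GET /props` should return only the three remaining fields. A sketch assuming a locally running server:

```sh
# Read-only by default; POST /props requires starting the server with --props.
curl http://localhost:8080/props
# Expected response shape after this change:
#   { "default_generation_settings": { ... }, "total_slots": 1, "chat_template": "" }
```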