
Commit 95c76e8

server : remove legacy system_prompt feature (#9857)
* server : remove legacy system_prompt feature
  ggml-ci
* readme : update
  [no ci]
* server : fix non-transformer logic + remove response from /props
1 parent 11ac980 commit 95c76e8

File tree

4 files changed, +19 -108 lines changed


Diff for: common/arg.cpp

-17 lines changed
@@ -1788,23 +1788,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(common_arg(
-        {"-spf", "--system-prompt-file"}, "FNAME",
-        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
-        [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::string system_prompt;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-            );
-            params.system_prompt = system_prompt;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

Diff for: common/common.h

-1 line changed
@@ -282,7 +282,6 @@ struct common_params {
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;

Diff for: examples/server/README.md

+1 -5 lines changed
@@ -149,7 +149,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
@@ -320,7 +319,6 @@ node index.js
 
 - The prompt is a string or an array with the first element given as a string
 - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
-- The system prompt is empty
 
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
@@ -536,14 +534,12 @@ This endpoint is public (no API key check). By default, it is read-only. To make
 
 ```json
 {
-  "system_prompt": "",
   "default_generation_settings": { ... },
   "total_slots": 1,
   "chat_template": ""
 }
 ```
 
-- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template
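With `system_prompt` gone from the `/props` response above (and from the POST `/props` options below), a shared preamble now has to travel with each request, typically as a `system` message that the chat template then formats. A hypothetical client-side sketch using nlohmann JSON (the library behind the server's `json` type; the prompt text and endpoint comment are illustrative only):

```cpp
#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

// Build a /v1/chat/completions request body that carries its own system
// message, since the server no longer prepends one globally.
int main() {
    nlohmann::json body;
    body["messages"] = {
        { {"role", "system"}, {"content", "You are a helpful assistant."} },  // placeholder prompt
        { {"role", "user"},   {"content", "Hello!"} },
    };

    // POST body.dump() to the server's /v1/chat/completions endpoint.
    std::cout << body.dump(2) << std::endl;
    return 0;
}
```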
@@ -554,7 +550,7 @@ To use this endpoint with POST method, you need to start server with `--props`
 
 *Options:*
 
-- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
+- None yet
 
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
 
Diff for: examples/server/server.cpp

+18 -85 lines changed
@@ -623,12 +623,6 @@ struct server_context {
 
     int32_t n_ctx; // total context for all clients / slots
 
-    // system prompt
-    bool system_need_update = false;
-
-    std::string system_prompt;
-    std::vector<llama_token> system_tokens;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
@@ -665,7 +659,7 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // dedicate one sequence to the system prompt
+        // reserve one extra sequence (seq_id == 0) for extra features
        params.n_parallel += 1;
 
         common_init_result llama_init = common_init_from_params(params);
@@ -1061,51 +1055,6 @@ struct server_context {
         clean_kv_cache = false;
     }
 
-    void system_prompt_update() {
-        SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
-
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, true);
-
-            const int32_t n_batch = llama_n_batch(ctx);
-            const int32_t n_tokens_prompt = system_tokens.size();
-
-            for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
-                const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
-
-                common_batch_clear(batch);
-
-                for (int32_t j = 0; j < n_tokens; ++j) {
-                    common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
-                }
-
-                if (llama_decode(ctx, batch) != 0) {
-                    SRV_ERR("%s", "llama_decode() failed\n");
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i <= params.n_parallel; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-            }
-        }
-
-        system_need_update = false;
-    }
-
-    bool system_prompt_set(const std::string & sys_prompt) {
-        SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
-
-        system_prompt = sys_prompt;
-        // update system_tokens and KV cache as soon as all slots are idle
-        system_need_update = true;
-        return true;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
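For reference, the removed `system_prompt_update()` boiled down to one pattern: decode the shared tokens once into sequence 0, then mirror that KV range into every slot's sequence. A standalone, hypothetical sketch of that prefix-sharing pattern, reusing the same llama.cpp/common helpers the removed code called (the context, batch and token vector are assumed to be set up elsewhere):

```cpp
#include <algorithm>
#include <vector>

#include "common.h"  // common_batch_clear / common_batch_add
#include "llama.h"

// Decode a shared token prefix into sequence 0, then copy the cached range
// into sequences 1..n_parallel with llama_kv_cache_seq_cp (hypothetical helper).
static void decode_shared_prefix(llama_context * ctx, llama_batch & batch,
                                 const std::vector<llama_token> & prefix, int32_t n_parallel) {
    const int32_t n_batch = llama_n_batch(ctx);
    const int32_t n_total = (int32_t) prefix.size();

    for (int32_t i = 0; i < n_total; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, n_total - i);

        common_batch_clear(batch);
        for (int32_t j = 0; j < n_tokens; ++j) {
            // all prefix tokens go to sequence 0; no logits needed
            common_batch_add(batch, prefix[i + j], i + j, { 0 }, false);
        }

        if (llama_decode(ctx, batch) != 0) {
            return; // decoding failed; leave the cache as-is
        }
    }

    // share the decoded prefix with every other sequence
    for (int32_t s = 1; s <= n_parallel; ++s) {
        llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
    }
}
```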
@@ -1855,12 +1804,8 @@ struct server_context {
         }
 
         if (all_idle) {
-            if (system_need_update) {
-                system_prompt_update();
-            }
-
             SRV_INF("%s", "all slots are idle\n");
-            if (system_prompt.empty() && clean_kv_cache) {
+            if (clean_kv_cache) {
                 kv_cache_clear();
             }
 
@@ -1882,7 +1827,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
-                if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
                     if (!params.ctx_shift) {
                         // this check is redundant (for good)
                         // we should never get here, because generation should already stopped in process_token()
@@ -1893,13 +1838,13 @@ struct server_context {
 
                     // Shift context
                     const int n_keep = slot.params.n_keep + add_bos_token;
-                    const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
+                    const int n_left = slot.n_past - n_keep;
                     const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
 
                     SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep, n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
 
                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
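To make the shift arithmetic above concrete, here is a tiny, self-contained illustration with made-up numbers (no llama.cpp calls, just the same formulas):

```cpp
#include <cstdio>

int main() {
    // Hypothetical slot state, chosen only to illustrate the formulas above.
    const int n_keep    = 4;                // tokens preserved at the start of the sequence
    const int n_past    = 100;              // tokens currently cached for the slot
    const int n_left    = n_past - n_keep;  // 96
    const int n_discard = n_left / 2;       // 48 (default when params.n_discard == 0)

    // Corresponds to:
    //   llama_kv_cache_seq_rm (ctx, seq, n_keep,             n_keep + n_discard);
    //   llama_kv_cache_seq_add(ctx, seq, n_keep + n_discard, n_past, -n_discard);
    printf("remove KV cells [%d, %d), then shift [%d, %d) back by %d\n",
           n_keep, n_keep + n_discard, n_keep + n_discard, n_past, n_discard);
    return 0;
}
```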
@@ -1929,18 +1874,16 @@ struct server_context {
 
             const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
 
-            // TODO: we always have to take into account the "system_tokens"
-            //       this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot_npast, { slot.id + 1 }, true);
 
             slot.n_past += 1;
 
             if (slot.params.cache_prompt) {
                 slot.cache_tokens.push_back(slot.sampled);
             }
 
-            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
-                    slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
         }
 
         // process in chunks of params.n_batch
@@ -1971,7 +1914,7 @@ struct server_context {
                         case SERVER_TASK_CMPL_TYPE_NORMAL:
                         case SERVER_TASK_CMPL_TYPE_EMBEDDING:
                             {
-                                prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                                prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
                             } break;
                         case SERVER_TASK_CMPL_TYPE_RERANK:
                             {
@@ -2050,7 +1993,7 @@ struct server_context {
                 } else {
                     if (!params.ctx_shift) {
                         // if context shift is disabled, we make sure prompt size is smaller than KV size
-                        if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                        if (slot.n_prompt_tokens >= slot.n_ctx) {
                             slot.release();
                             send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
                             continue;
@@ -2138,22 +2081,19 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    int p0 = (int) system_tokens.size() + slot.n_past;
+                    int p0 = slot.n_past;
+
                     if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
                         llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
 
-                        p0 = (int) system_tokens.size();
-                        if (p0 != 0) {
-                            // copy over the system prompt when there is one
-                            llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
-                        }
+                        p0 = 0;
 
-                        // there is no common part left (except for the system prompt)
+                        // there is no common part left
                         slot.n_past = 0;
                         slot.n_past_se = 0;
                         slot.ga_i = 0;
-                        // TODO: is the system prompt ever in the sampling context?
+
                         common_sampler_reset(slot.smpl);
                     }
 
@@ -2179,7 +2119,7 @@ struct server_context {
                         }
                     }
 
-                    common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                    common_batch_add(batch, prompt_tokens[slot.n_past], slot_npast, { slot.id + 1 }, false);
 
                     if (slot.params.cache_prompt) {
                         slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2409,10 +2349,6 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;
 
-    if (!params.system_prompt.empty()) {
-        ctx_server.system_prompt_set(params.system_prompt);
-    }
-
     if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }
@@ -2840,7 +2776,6 @@ int main(int argc, char ** argv) {
 
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
-            { "system_prompt", ctx_server.system_prompt },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params.n_parallel },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
@@ -2856,10 +2791,8 @@ int main(int argc, char ** argv) {
         }
 
         json data = json::parse(req.body);
-        if (data.contains("system_prompt")) {
-            std::string system_prompt = data.at("system_prompt");
-            ctx_server.system_prompt_set(system_prompt);
-        }
+
+        // update any props here
 
         res_ok(res, {{ "success", true }});
     };
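The POST handler is now a stub. If a writable property is added back later, it would presumably follow the same `contains()`/`at()` parsing pattern the removed branch used; a hypothetical sketch (the property name `some_prop` is made up and not part of the current API):

```cpp
#include <string>

#include <nlohmann/json.hpp>

// Hypothetical: pull a future writable property out of a POST /props body,
// mirroring the parsing pattern of the removed system_prompt branch.
static bool parse_props_update(const std::string & body, std::string & some_prop_out) {
    const nlohmann::json data = nlohmann::json::parse(body);

    if (data.contains("some_prop")) {
        some_prop_out = data.at("some_prop").get<std::string>();
        return true;
    }
    return false;
}
```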
