
Commit 1717521

authored Oct 24, 2023
server : do not block system prompt update (#3767)

* server : do not block system prompt update
* server : update state machine logic to process system prompts
* server : minor
1 parent b2f7e04 commit 1717521
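
In short: before this change, a system prompt update busy-waited (wait_all_are_idle() plus a 5 ms polling loop on system_need_update) until the main loop had reloaded the prompt, blocking the caller for the whole reload. After it, the caller only releases the slots and sets the system_need_update flag, and the main loop applies the update on a later iteration once all slots are idle. Below is a minimal, self-contained sketch of the new flow; slot and server are simplified stand-ins, with update_slots() reduced to the one check that matters, so only the names that also appear in the diff follow the real code.

    // Sketch of the non-blocking system prompt update introduced by this commit.
    #include <vector>

    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT, RELEASE };

    struct slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;

        void release() {
            // after this commit IDLE slots are released too, so a pending
            // LOAD_PROMPT is cancelled instead of being left dangling
            if (state == IDLE || state == PROCESSING) {
                command = RELEASE;
            }
        }
    };

    struct server {
        std::vector<slot> slots = std::vector<slot>(4);
        bool system_need_update = false;
        bool all_slots_are_idle = true;

        // request side: mark the work and return immediately
        // (previously this spun on system_need_update in 5 ms steps)
        void notify_system_prompt_changed() {
            for (auto & s : slots) {
                s.release();
            }
            system_need_update = true;
        }

        // main-loop side: apply the update only once every slot is idle
        void update_slots() {
            if (system_need_update && all_slots_are_idle) {
                system_need_update = false; // stand-in for update_system_prompt()
            }
        }
    };

    int main() {
        server srv;
        srv.notify_system_prompt_changed(); // returns without blocking
        srv.update_slots();                 // the reload happens here instead
        return 0;
    }

This also explains why the old code could stall: the polling loop waited for the main loop to clear system_need_update, so if the caller was running on the main loop's own task-processing path, the flag could never be cleared while it waited.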

File tree

1 file changed: +20 -37 lines

examples/server/server.cpp (+20 -37)
@@ -454,7 +454,7 @@ struct llama_client_slot
     }

     void release() {
-        if (state == PROCESSING)
+        if (state == IDLE || state == PROCESSING)
         {
             t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
             command = RELEASE;
@@ -754,6 +754,7 @@ struct llama_server_context
        }

        slot->params.antiprompt.clear();
+
        const auto &stop = data.find("stop");
        if (stop != data.end() && stop->is_array())
        {
@@ -867,7 +868,7 @@ struct llama_server_context

        kv_cache_clear();

-       for (int32_t i = 0; i < batch.n_tokens; ++i)
+       for (int i = 0; i < (int) system_tokens.size(); ++i)
        {
            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
        }
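
The loop-bound fix above deserves a note: batch.n_tokens is whatever the batch happened to hold when the loop ran, which has no relation to the length of the system prompt, while system_tokens.size() is exactly the number of tokens to feed. A self-contained toy of the failure mode follows (toy_batch and its helpers are stand-ins, not llama.cpp's llama_batch API; in the toy the batch starts empty, so the old bound skips the loop entirely, whereas in the real server the stale value could equally be a leftover count from a previous decode):

    // Toy demonstrating why bounding on batch.n_tokens is wrong.
    #include <cstdio>
    #include <vector>

    struct toy_batch {
        std::vector<int> tokens;
        int n_tokens = 0;
    };

    static void batch_clear(toy_batch & b) { b.tokens.clear(); b.n_tokens = 0; }
    static void batch_add(toy_batch & b, int tok) { b.tokens.push_back(tok); b.n_tokens++; }

    int main() {
        std::vector<int> system_tokens = { 101, 102, 103 };
        toy_batch batch;
        batch_clear(batch);

        // old (buggy) bound: n_tokens is 0 here, so the body never runs
        for (int i = 0; i < batch.n_tokens; ++i) { /* unreachable */ }

        // new bound: iterate the system tokens themselves
        for (int i = 0; i < (int) system_tokens.size(); ++i) {
            batch_add(batch, system_tokens[i]);
        }
        printf("batched %d system tokens\n", batch.n_tokens); // prints 3
        return 0;
    }
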
@@ -894,16 +895,8 @@ struct llama_server_context
        {
            slot.release();
        }
-       wait_all_are_idle();
-       all_slots_are_idle = true;

-       // wait until system prompt load
        system_need_update = true;
-       while (system_need_update)
-       {
-           std::this_thread::sleep_for(std::chrono::milliseconds(5));
-       }
-       // system prompt loaded, continue
    }

    void process_system_prompt_data(const json &sys_props) {
@@ -915,26 +908,6 @@ struct llama_server_context
        {
            notify_system_prompt_changed();
        }
-       else
-       {
-           system_need_update = true;
-       }
-   }
-
-   void wait_all_are_idle() {
-       bool wait = true;
-       while (wait)
-       {
-           wait = false;
-           for (auto &slot : slots)
-           {
-               if (!slot.available())
-               {
-                   wait = true;
-                   break;
-               }
-           }
-       }
    }

    static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -965,7 +938,6 @@ struct llama_server_context
                slot.has_next_token = false;
            }
            stop_pos = pos;
-
        }
    }

@@ -1444,7 +1416,7 @@ struct llama_server_context
        process_tasks();

        // update the system prompt wait until all slots are idle state
-       if (system_need_update)
+       if (system_need_update && all_slots_are_idle)
        {
            LOG_TEE("updating system prompt\n");
            update_system_prompt();
@@ -1498,7 +1470,7 @@ struct llama_server_context
        for (auto & slot : slots)
        {
            // release the slot
-           if (slot.state == PROCESSING && slot.command == RELEASE)
+           if (slot.command == RELEASE)
            {
                slot.state = IDLE;
                slot.command = NONE;
@@ -1509,7 +1481,7 @@ struct llama_server_context
                continue;
            }

-           if (slot.state == IDLE || slot.command == RELEASE)
+           if (slot.state == IDLE)
            {
                continue;
            }
@@ -1530,6 +1502,17 @@ struct llama_server_context
        {
            for (auto & slot : slots)
            {
+               const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+
+               // empty prompt passed -> release the slot and send empty response
+               if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+               {
+                   slot.release();
+                   slot.print_timings();
+                   send_final_response(slot);
+                   continue;
+               }
+
                // need process the prompt
                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
                {
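
The new guard is worth isolating: a prompt counts as present if it is a token array or a non-empty string; anything else (empty string, null, missing) now short-circuits to an immediate empty final response instead of entering prompt processing. A small sketch of that predicate, using nlohmann::json as the server does for slot.prompt (has_prompt is a hypothetical helper name here; the diff inlines the expression):

    // Isolated sketch of the empty-prompt check.
    #include <nlohmann/json.hpp>
    #include <cstdio>
    #include <string>

    using json = nlohmann::json;

    static bool has_prompt(const json & prompt) {
        return prompt.is_array() ||
               (prompt.is_string() && !prompt.get<std::string>().empty());
    }

    int main() {
        printf("%d\n", has_prompt(json::array({1, 2, 3}))); // 1: token array
        printf("%d\n", has_prompt(json("hello")));          // 1: non-empty string
        printf("%d\n", has_prompt(json("")));               // 0: empty string
        printf("%d\n", has_prompt(json()));                 // 0: null / missing
        return 0;
    }
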
@@ -1749,8 +1732,8 @@ struct llama_server_context
                if (!process_token(result, slot))
                {
                    slot.release();
-                   send_final_response(slot);
                    slot.print_timings();
+                   send_final_response(slot);
                }

                slot.i_batch = -1;
@@ -2285,7 +2268,7 @@ int main(int argc, char **argv)
            if (!json_value(data, "stream", false)) {
                std::string completion_text;
                task_result result = llama.next_result(task_id);
-               if(!result.error && result.stop) {
+               if (!result.error && result.stop) {
                    res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
                }
                else
@@ -2312,7 +2295,7 @@ int main(int argc, char **argv)
                {
                    return false;
                }
-               if(result.stop) {
+               if (result.stop) {
                    break;
                }
            } else {
