@@ -454,7 +454,7 @@ struct llama_client_slot
     }

     void release() {
-        if (state == PROCESSING)
+        if (state == IDLE || state == PROCESSING)
         {
             t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
             command = RELEASE;
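
This first hunk widens release() so a slot can be released from the IDLE state as well, not only mid-generation. A standalone sketch of the state/command handshake (the enum names mirror the server's, but the types here are hypothetical stand-ins, not the actual llama.cpp structs):

    #include <cstdio>

    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT, RELEASE };

    struct slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;

        void release() {
            // before the fix only PROCESSING slots queued RELEASE; a slot
            // still IDLE (e.g. given an empty prompt) could never be freed
            if (state == IDLE || state == PROCESSING) {
                command = RELEASE;
            }
        }
    };

    int main() {
        slot s;
        s.command = LOAD_PROMPT;   // work queued, but generation never started
        s.release();
        std::printf("released: %s\n", s.command == RELEASE ? "yes" : "no");
        return 0;
    }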
@@ -754,6 +754,7 @@ struct llama_server_context
         }

         slot->params.antiprompt.clear();
+
         const auto &stop = data.find("stop");
         if (stop != data.end() && stop->is_array())
         {
@@ -867,7 +868,7 @@ struct llama_server_context

         kv_cache_clear();

-        for (int32_t i = 0; i < batch.n_tokens; ++i)
+        for (int i = 0; i < (int) system_tokens.size(); ++i)
         {
             llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
         }
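
The old loop bound is self-referential: it reads batch.n_tokens, the counter of the very batch being filled, so if the batch starts out empty the loop runs zero times and the system tokens are never queued. The fix bounds the loop on the source vector instead. A minimal sketch of the corrected shape, with a toy batch type standing in for llama_batch/llama_batch_add:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;

    // toy stand-in for llama_batch: n_tokens grows as tokens are added
    struct toy_batch {
        std::vector<llama_token> tokens;
        int n_tokens = 0;
    };

    static void toy_batch_add(toy_batch & batch, llama_token tok) {
        batch.tokens.push_back(tok);
        batch.n_tokens++;
    }

    int main() {
        const std::vector<llama_token> system_tokens = { 1, 15043, 3186 };
        toy_batch batch; // freshly cleared: n_tokens == 0

        // old bound: i < batch.n_tokens -> 0 iterations, nothing queued
        // new bound: iterate the tokens we actually want to submit
        for (int i = 0; i < (int) system_tokens.size(); ++i) {
            toy_batch_add(batch, system_tokens[i]);
        }
        std::printf("queued %d system tokens\n", batch.n_tokens);
        return 0;
    }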
@@ -894,16 +895,8 @@ struct llama_server_context
         {
             slot.release();
         }
-        wait_all_are_idle();
-        all_slots_are_idle = true;

-        // wait until system prompt load
         system_need_update = true;
-        while (system_need_update)
-        {
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
-        }
-        // system prompt loaded, continue
     }

     void process_system_prompt_data(const json &sys_props) {
@@ -915,26 +908,6 @@ struct llama_server_context
         {
             notify_system_prompt_changed();
         }
-        else
-        {
-            system_need_update = true;
-        }
-    }
-
-    void wait_all_are_idle() {
-        bool wait = true;
-        while (wait)
-        {
-            wait = false;
-            for (auto &slot : slots)
-            {
-                if (!slot.available())
-                {
-                    wait = true;
-                    break;
-                }
-            }
-        }
     }

     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -965,7 +938,6 @@ struct llama_server_context
                     slot.has_next_token = false;
                 }
                 stop_pos = pos;
-
             }
         }
@@ -1444,7 +1416,7 @@ struct llama_server_context
         process_tasks();

         // update the system prompt wait until all slots are idle state
-        if (system_need_update)
+        if (system_need_update && all_slots_are_idle)
         {
             LOG_TEE("updating system prompt\n");
             update_system_prompt();
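
Together with the two hunks above that delete wait_all_are_idle() and the 5 ms polling loop, this turns a blocking handshake into a deferred one: notify_system_prompt_changed() only raises system_need_update, and the main loop applies the update once every slot has drained. A standalone model of that pattern (simplified and single-threaded; the real server performs this check inside its slot-update loop):

    #include <cstdio>

    struct server_model {
        bool system_need_update = false;
        bool all_slots_are_idle = true;

        void notify_system_prompt_changed() {
            // old code spun here (sleeping 5 ms) until the update was
            // applied, blocking the caller; now it only raises the flag
            system_need_update = true;
        }

        void update_system_prompt() {
            std::printf("updating system prompt\n");
            system_need_update = false;
        }

        void loop_iteration() {
            // applied once, and only when no slot is mid-generation
            if (system_need_update && all_slots_are_idle) {
                update_system_prompt();
            }
        }
    };

    int main() {
        server_model srv;
        srv.all_slots_are_idle = false;
        srv.notify_system_prompt_changed();
        srv.loop_iteration(); // slots busy: update deferred
        srv.all_slots_are_idle = true;
        srv.loop_iteration(); // slots drained: update applied
        return 0;
    }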
@@ -1498,7 +1470,7 @@ struct llama_server_context
         for (auto & slot : slots)
         {
             // release the slot
-            if (slot.state == PROCESSING && slot.command == RELEASE)
+            if (slot.command == RELEASE)
             {
                 slot.state = IDLE;
                 slot.command = NONE;
@@ -1509,7 +1481,7 @@ struct llama_server_context
                 continue;
             }

-            if (slot.state == IDLE || slot.command == RELEASE)
+            if (slot.state == IDLE)
             {
                 continue;
             }
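
These two hunks are the consuming side of the RELEASE command from the first hunk: the cleanup branch now keys on the command alone, so a slot released while still IDLE is also reset, and the subsequent skip test no longer needs the RELEASE clause. A standalone model of the reworked loop head (hypothetical minimal types, same enums as the sketch above):

    #include <cstdio>
    #include <vector>

    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT, RELEASE };

    struct slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;
    };

    static void update_slots(std::vector<slot> & slots) {
        for (auto & s : slots) {
            // previously this also required state == PROCESSING, which left
            // an IDLE slot with a pending RELEASE stuck forever
            if (s.command == RELEASE) {
                s.state   = IDLE;
                s.command = NONE;
                continue;
            }
            if (s.state == IDLE) {
                continue; // nothing to decode for this slot
            }
            // ... decode pending tokens for PROCESSING slots ...
        }
    }

    int main() {
        std::vector<slot> slots(2);
        slots[0].command = RELEASE; // released while IDLE: now cleaned up
        update_slots(slots);
        std::printf("slot 0 reset: %s\n", slots[0].command == NONE ? "yes" : "no");
        return 0;
    }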
@@ -1530,6 +1502,17 @@ struct llama_server_context
         {
             for (auto & slot : slots)
             {
+                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+
+                // empty prompt passed -> release the slot and send empty response
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+                {
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    continue;
+                }
+
                 // need process the prompt
                 if (slot.state == IDLE && slot.command == LOAD_PROMPT)
                 {
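
The has_prompt predicate added here treats a prompt as present when it is a token array or a non-empty string; anything else (empty string, null) short-circuits into an immediate empty response instead of leaving the slot stuck. The check in isolation, using nlohmann::json (the library behind the server's json type); has_prompt as a free function is an illustration, not the server's API:

    #include <cassert>
    #include <string>
    #include <nlohmann/json.hpp> // the server vendors this as json.hpp

    using json = nlohmann::json;

    static bool has_prompt(const json & prompt) {
        // a token array, or a non-empty string -> something to process
        return prompt.is_array() || (prompt.is_string() && !prompt.get<std::string>().empty());
    }

    int main() {
        assert(!has_prompt(json("")));            // empty string: release the slot
        assert(!has_prompt(json()));              // null: release the slot
        assert( has_prompt(json("Hello")));       // plain text prompt
        assert( has_prompt(json::array({1, 2}))); // pre-tokenized prompt
        return 0;
    }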
@@ -1749,8 +1732,8 @@ struct llama_server_context
                 if (!process_token(result, slot))
                 {
                     slot.release();
-                    send_final_response(slot);
                     slot.print_timings();
+                    send_final_response(slot);
                 }

                 slot.i_batch = -1;
@@ -2285,7 +2268,7 @@ int main(int argc, char **argv)
             if (!json_value(data, "stream", false)) {
                 std::string completion_text;
                 task_result result = llama.next_result(task_id);
-                if (!result.error && result.stop ) {
+                if (!result.error && result.stop) {
                     res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
                 }
                 else
@@ -2312,7 +2295,7 @@ int main(int argc, char **argv)
                     {
                         return false;
                     }
-                    if (result.stop ) {
+                    if (result.stop) {
                         break;
                     }
                 } else {