@@ -454,7 +454,7 @@ struct llama_client_slot
     }

     void release() {
-        if (state == PROCESSING)
+        if (state == IDLE || state == PROCESSING)
         {
             t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
             command = RELEASE;
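
This first hunk widens release() so a slot can be released from the IDLE state as well, not only mid-generation. A standalone sketch of the state/command handshake (the enum names mirror the server's, but the types here are hypothetical stand-ins, not the actual llama.cpp structs):

    #include <cstdio>

    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT, RELEASE };

    struct slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;

        void release() {
            // before the fix only PROCESSING slots queued RELEASE; a slot
            // still IDLE (e.g. given an empty prompt) could never be freed
            if (state == IDLE || state == PROCESSING) {
                command = RELEASE;
            }
        }
    };

    int main() {
        slot s;
        s.command = LOAD_PROMPT;   // work queued, but generation never started
        s.release();
        std::printf("released: %s\n", s.command == RELEASE ? "yes" : "no");
        return 0;
    }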
@@ -754,6 +754,7 @@ struct llama_server_context
         }

         slot->params.antiprompt.clear();
+
         const auto &stop = data.find("stop");
         if (stop != data.end() && stop->is_array())
         {
@@ -867,7 +868,7 @@ struct llama_server_context

         kv_cache_clear();

-        for (int32_t i = 0; i < batch.n_tokens; ++i)
+        for (int i = 0; i < (int) system_tokens.size(); ++i)
         {
             llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
         }
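
The old loop bound is self-referential: it reads batch.n_tokens, the counter of the very batch being filled, so if the batch starts out empty the loop runs zero times and the system tokens are never queued. The fix bounds the loop on the source vector instead. A minimal sketch of the corrected shape, with a toy batch type standing in for llama_batch/llama_batch_add:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;

    // toy stand-in for llama_batch: n_tokens grows as tokens are added
    struct toy_batch {
        std::vector<llama_token> tokens;
        int n_tokens = 0;
    };

    static void toy_batch_add(toy_batch & batch, llama_token tok) {
        batch.tokens.push_back(tok);
        batch.n_tokens++;
    }

    int main() {
        const std::vector<llama_token> system_tokens = { 1, 15043, 3186 };
        toy_batch batch; // freshly cleared: n_tokens == 0

        // old bound: i < batch.n_tokens -> 0 iterations, nothing queued
        // new bound: iterate the tokens we actually want to submit
        for (int i = 0; i < (int) system_tokens.size(); ++i) {
            toy_batch_add(batch, system_tokens[i]);
        }
        std::printf("queued %d system tokens\n", batch.n_tokens);
        return 0;
    }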
@@ -894,16 +895,8 @@ struct llama_server_context
         {
             slot.release();
         }
-        wait_all_are_idle();
-        all_slots_are_idle = true;

-        // wait until system prompt load
         system_need_update = true;
-        while (system_need_update)
-        {
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
-        }
-        // system prompt loaded, continue
     }

     void process_system_prompt_data(const json &sys_props) {
@@ -915,26 +908,6 @@ struct llama_server_context
         {
             notify_system_prompt_changed();
         }
-        else
-        {
-            system_need_update = true;
-        }
-    }
-
-    void wait_all_are_idle() {
-        bool wait = true;
-        while (wait)
-        {
-            wait = false;
-            for (auto &slot : slots)
-            {
-                if (!slot.available())
-                {
-                    wait = true;
-                    break;
-                }
-            }
-        }
     }

     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -965,7 +938,6 @@ struct llama_server_context
                     slot.has_next_token = false;
                 }
                 stop_pos = pos;
-
             }
         }
@@ -1444,7 +1416,7 @@ struct llama_server_context
         process_tasks();

         // update the system prompt wait until all slots are idle state
-        if (system_need_update)
+        if (system_need_update && all_slots_are_idle)
         {
             LOG_TEE("updating system prompt\n");
             update_system_prompt();
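
Together with the two hunks above that delete wait_all_are_idle() and the 5 ms polling loop, this turns a blocking handshake into a deferred one: notify_system_prompt_changed() only raises system_need_update, and the main loop applies the update once every slot has drained. A standalone model of that pattern (simplified and single-threaded; the real server performs this check inside its slot-update loop):

    #include <cstdio>

    struct server_model {
        bool system_need_update = false;
        bool all_slots_are_idle = true;

        void notify_system_prompt_changed() {
            // old code spun here (sleeping 5 ms) until the update was
            // applied, blocking the caller; now it only raises the flag
            system_need_update = true;
        }

        void update_system_prompt() {
            std::printf("updating system prompt\n");
            system_need_update = false;
        }

        void loop_iteration() {
            // applied once, and only when no slot is mid-generation
            if (system_need_update && all_slots_are_idle) {
                update_system_prompt();
            }
        }
    };

    int main() {
        server_model srv;
        srv.all_slots_are_idle = false;
        srv.notify_system_prompt_changed();
        srv.loop_iteration(); // slots busy: update deferred
        srv.all_slots_are_idle = true;
        srv.loop_iteration(); // slots drained: update applied
        return 0;
    }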
@@ -1498,7 +1470,7 @@ struct llama_server_context
         for (auto & slot : slots)
         {
             // release the slot
-            if (slot.state == PROCESSING && slot.command == RELEASE)
+            if (slot.command == RELEASE)
             {
                 slot.state = IDLE;
                 slot.command = NONE;
@@ -1509,7 +1481,7 @@ struct llama_server_context
                 continue;
             }

-            if (slot.state == IDLE || slot.command == RELEASE)
+            if (slot.state == IDLE)
             {
                 continue;
             }
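
These two hunks are the consuming side of the RELEASE command from the first hunk: the cleanup branch now keys on the command alone, so a slot released while still IDLE is also reset, and the subsequent skip test no longer needs the RELEASE clause. A standalone model of the reworked loop head (hypothetical minimal types, same enums as the sketch above):

    #include <cstdio>
    #include <vector>

    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT, RELEASE };

    struct slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;
    };

    static void update_slots(std::vector<slot> & slots) {
        for (auto & s : slots) {
            // previously this also required state == PROCESSING, which left
            // an IDLE slot with a pending RELEASE stuck forever
            if (s.command == RELEASE) {
                s.state   = IDLE;
                s.command = NONE;
                continue;
            }
            if (s.state == IDLE) {
                continue; // nothing to decode for this slot
            }
            // ... decode pending tokens for PROCESSING slots ...
        }
    }

    int main() {
        std::vector<slot> slots(2);
        slots[0].command = RELEASE; // released while IDLE: now cleaned up
        update_slots(slots);
        std::printf("slot 0 reset: %s\n", slots[0].command == NONE ? "yes" : "no");
        return 0;
    }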
@@ -1530,6 +1502,17 @@ struct llama_server_context
         {
             for (auto & slot : slots)
             {
+                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+
+                // empty prompt passed -> release the slot and send empty response
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+                {
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    continue;
+                }
+
                 // need process the prompt
                 if (slot.state == IDLE && slot.command == LOAD_PROMPT)
                 {
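
The has_prompt predicate added here treats a prompt as present when it is a token array or a non-empty string; anything else (empty string, null) short-circuits into an immediate empty response instead of leaving the slot stuck. The check in isolation, using nlohmann::json (the library behind the server's json type); has_prompt as a free function is an illustration, not the server's API:

    #include <cassert>
    #include <string>
    #include <nlohmann/json.hpp> // the server vendors this as json.hpp

    using json = nlohmann::json;

    static bool has_prompt(const json & prompt) {
        // a token array, or a non-empty string -> something to process
        return prompt.is_array() || (prompt.is_string() && !prompt.get<std::string>().empty());
    }

    int main() {
        assert(!has_prompt(json("")));            // empty string: release the slot
        assert(!has_prompt(json()));              // null: release the slot
        assert( has_prompt(json("Hello")));       // plain text prompt
        assert( has_prompt(json::array({1, 2}))); // pre-tokenized prompt
        return 0;
    }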
@@ -1749,8 +1732,8 @@ struct llama_server_context
                 if (!process_token(result, slot))
                 {
                     slot.release();
-                    send_final_response(slot);
                     slot.print_timings();
+                    send_final_response(slot);
                 }

                 slot.i_batch = -1;
@@ -2285,7 +2268,7 @@ int main(int argc, char **argv)
             if (!json_value(data, "stream", false)) {
                 std::string completion_text;
                 task_result result = llama.next_result(task_id);
-                if (!result.error && result.stop ) {
+                if (!result.error && result.stop) {
                     res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
                 }
                 else
@@ -2312,7 +2295,7 @@ int main(int argc, char **argv)
                     {
                         return false;
                     }
-                    if (result.stop ) {
+                    if (result.stop) {
                         break;
                     }
                 } else {