@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
15851585            }
15861586        }
15871587
1588+         //  average size per token
1589+         const  float  size_per_token = std::max<float >(1 .0f , float (size ()) / (std::max<size_t >(1 , n_tokens ())));
1590+ 
1591+         //  dynamically increase the token limit if it can fit in the memory limit
1592+         const  size_t  limit_tokens_cur = limit_size > 0  ? std::max<size_t >(limit_tokens, limit_size/size_per_token) : limit_tokens;
1593+ 
15881594        if  (limit_tokens > 0 ) {
1589-             while  (states.size () > 1  && n_tokens () > limit_tokens ) {
1595+             while  (states.size () > 1  && n_tokens () > limit_tokens_cur ) {
15901596                if  (states.empty ()) {
15911597                    break ;
15921598                }
15931599
1594-                 SRV_WRN ("  - cache token limit reached, removing oldest entry (size = %.3f MiB)\n " front ().size () / (1024.0  * 1024.0 ));
1600+                 SRV_WRN ("  - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n " 
1601+                         limit_tokens, limit_tokens_cur, states.front ().size () / (1024.0  * 1024.0 ));
15951602
15961603                states.pop_front ();
15971604            }
15981605        }
15991606
1600-         SRV_WRN ("  - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n " 
1601-                 states.size (), size () / (1024.0  * 1024.0 ), limit_size / (1024.0  * 1024.0 ), limit_tokens);
1607+         SRV_WRN ("  - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est )\n " 
1608+                 states.size (), size () / (1024.0  * 1024.0 ), limit_size / (1024.0  * 1024.0 ), limit_tokens, limit_tokens_cur );
16021609
16031610        for  (const  auto  & state : states) {
1604-             SRV_WRN ("    - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n " const  void  *)&state, state.n_tokens (), state.checkpoints .size (), state.size () / (1024.0  * 1024.0 ));
1611+             SRV_WRN ("    - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n " 
1612+                     (const  void  *)&state, state.n_tokens (), state.checkpoints .size (), state.size () / (1024.0  * 1024.0 ));
16051613        }
16061614    }
16071615};
0 commit comments