Improve log messages around the max sequence length (#103)

#### Motivation The existing messages were confusing to users. #### Modifications In the router, the error message was rephrased to make it more understandable for users who aren't familiar with the internals. In the server, we now print the maximum possible sequence length limited by the model sequence length. The existing print was showing how many output tokens can fit into memory if you pass max_sequence_length input tokens and vice versa. I don't know what I was thinking when I wrote that. #### Related Issues https://github.ibm.com/ai-foundation/watson-fm-stack-tracker/issues/958 --------- Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Maximilien de Bayser <mbayser@br.ibm.com>
IBM · Jun 28, 2024 · 5b5938e · 5b5938e
1 parent 009a2ba
commit 5b5938e
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 8 deletions.
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -222,8 +222,8 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
             self.batch_type.prefill_weight(&single_request_stats, 1);
         if max_batch_weight < single_request_prefill_weight {
             panic!(
-                "max_batch_weight ({}) not large enough for (prefill) max_sequence_length ({})",
-                max_batch_weight, max_sequence_length
+                "The provided max_sequence length ({}) results in a prefill batch weight that exceeds the estimated capacity ({})",
+                max_sequence_length, max_batch_weight
             )
         }
 
@@ -232,8 +232,8 @@ impl<'a, B: BatchType> BatchConfigValidator<'a, B> {
             .batch_initial_weight(&single_request_stats, 1);
         if max_batch_weight < single_request_nexttoken_weight {
             panic!(
-                "max_batch_weight ({}) not large enough for (next-token) max_sequence_length ({})",
-                max_batch_weight, max_sequence_length
+                "The provided max_sequence length ({}) results in a next-token batch weight that exceeds the estimated capacity ({})",
+                max_sequence_length, max_batch_weight
             )
         }
     }

diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
@@ -409,13 +409,15 @@ def estimate_memory():
                 memory_scaling_model = estimate_memory()
                 compile()
 
-            max_input = memory_scaling_model.max_input_len_for_nt(1, max_sequence_length-1, sys.maxsize)
-            max_output = memory_scaling_model.max_output_len_for_nt(1, max_sequence_length-1, sys.maxsize)
-
             if local_rank == 0:
+                # For a batch of size 1 and an output of 1, get max input limited by max_sequence_length
+                max_input  = memory_scaling_model.max_input_len_for_nt(1, 1, max_sequence_length)
+                # For a batch of size 1 and an input of 1, get max output limited by max_sequence_length
+                max_output = memory_scaling_model.max_output_len_for_nt(1, 1, max_sequence_length)
+                max_theoretical_len = min(max_input, max_output) + 1
                 print(
                     "Maximum possible sequence length given available memory (for batch size 1): "
-                    f"{min(max_input, max_output)}"
+                    f"{max_theoretical_len}"
                 )
 
         elif ESTIMATE_MEMORY == "manual":