4242 "max_output_tokens" ,
4343 "max_input_tokens" ,
4444 "use_phoenix" ,
45+ "via_streaming" ,
4546 "min_delay" , # minimum delay between queries for that model
4647]
4748
@@ -641,6 +642,7 @@ def query(
     debug=False,
     litellm_debug=None,
     stream=False,
+    via_stream=False,
     recursive_call_info: Optional[Dict[str, any]] = None,
     **kwargs,
 ) -> Dict[str, any]:
@@ -657,8 +659,10 @@ def query(
         return_response: whether or not the complete response should get returned
         debug: if True, emits debug messages to aid development and debugging
         litellm_debug: if True, litellm debug logging is enabled, if False, disabled, if None, use debug setting
-        stream: if True, the returned object containst the stream that can be iterated over. Streaming
+        stream: if True, the returned object contains the stream that can be iterated over. Streaming
             may not work for all models.
+        via_stream: if True, the stream parameter is ignored and the response data is retrieved internally
+            via streaming. This may be useful if the non-streaming response keeps timing out.
         recursive_call_info: internal use only
         kwargs: any additional keyword arguments to pass on to the LLM
 
@@ -708,7 +712,10 @@ def query(
         fmap = toolnames2funcs(tools)
     else:
         fmap = {}
-    if stream:
+    if via_stream:
+        # TODO: check if model supports streaming
+        completion_kwargs["stream"] = True
+    elif stream:
         # TODO: check if model supports streaming
         # if streaming is enabled, we always return the original response
         return_response = True
@@ -743,7 +750,36 @@ def query(
             model=llm["llm"],
             messages=messages,
             **completion_kwargs)
-        if stream:
+        if via_stream:
+            # retrieve the response using streaming, return once we have everything
+            try:
+                answer = ""
+                for chunk in response:
+                    choice0 = chunk["choices"][0]
+                    if choice0.finish_reason == "stop":
+                        logger.debug(f"DEBUG: streaming got stop chunk: {chunk}")
+                        break
+                    content = choice0["delta"].get("content") or ""  # delta content may be None
+                    logger.debug(f"DEBUG: streaming content: {content}")
+                    answer += content
+                ret["answer"] = answer
+                if return_response:
+                    ret["response"] = response
+                ret["cost"] = None
+                ret["elapsed_time"] = time.time() - start
+                ret["ok"] = True
+                ret["error"] = ""
+                return ret
+            except Exception as e:
+                tb = traceback.extract_tb(e.__traceback__)
+                filename, lineno, funcname, text = tb[-1]
+                ret["error"] = str(e) + f" in {filename}:{lineno} {funcname}"
+                if debug:
+                    logger.error(f"Returning error: {e}")
+                ret["answer"] = ""
+                ret["ok"] = False
+                return ret
+        elif stream:
             def chunk_generator(model_generator, retobj):
                 try:
                     for chunk in model_generator:
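
Note: the via_stream branch above relies on litellm's streaming chunk shape, where each chunk
exposes choices[0].delta.content and the final chunk carries finish_reason == "stop". Below is a
minimal standalone sketch of the same accumulation pattern, calling litellm directly; the model
name and prompt are placeholders rather than values from this repository:

import litellm

# Request a streamed completion; each yielded chunk holds an incremental delta.
response = litellm.completion(
    model="gpt-4o-mini",  # placeholder model name
    messages=[{"role": "user", "content": "Count to three."}],
    stream=True,
)
answer = ""
for chunk in response:
    choice0 = chunk["choices"][0]
    if choice0.finish_reason == "stop":  # final chunk: the stream is complete
        break
    answer += choice0["delta"].get("content") or ""  # delta content may be None
print(answer)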
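
From the caller's side, the new flag would be passed like any other keyword argument to query.
The full positional signature of query is not shown in these hunks, so the call shape below is an
assumption; only the via_stream parameter and the returned keys (answer, ok, error, elapsed_time)
come from the diff itself:

from llms import query  # hypothetical import path

ret = query(
    llm,  # model config dict; assumed to be the first argument
    messages=[{"role": "user", "content": "Hello"}],
    via_stream=True,  # assemble the answer internally via streaming
)
if ret["ok"]:
    print(ret["answer"])
else:
    print("query failed:", ret["error"])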