Skip to content

Commit d9bba83

Browse files
committed
Add streaming support
1 parent 87c9e63 commit d9bba83

File tree

3 files changed

+222
-1
lines changed

3 files changed

+222
-1
lines changed

llms_wrapper/llms.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,7 @@ def query(
593593
return_response: bool = False,
594594
debug=False,
595595
litellm_debug=None,
596+
stream=True,
596597
recursive_call_info: Optional[Dict[str, any]] = None,
597598
**kwargs,
598599
) -> Dict[str, any]:
@@ -609,6 +610,9 @@ def query(
609610
return_response: whether or not the complete response should get returned
610611
debug: if True, emits debug messages to aid development and debugging
611612
litellm_debug: if True, litellm debug logging is enabled, if False, disabled, if None, use debug setting
613+
stream: if True, the returned object contains the stream that can be iterated over. Streaming
614+
may not work for all models.
615+
recursive_call_info: internal use only
612616
kwargs: any additional keyword arguments to pass on to the LLM
613617
614618
Returns:
@@ -657,6 +661,11 @@ def query(
657661
fmap = toolnames2funcs(tools)
658662
else:
659663
fmap = {}
664+
if stream:
665+
# TODO: check if model supports streaming
666+
# if streaming is enabled, we always return the original response
667+
return_response = True
668+
completion_kwargs["stream"] = True
660669
ret = {}
661670
# before adding the kwargs, save the recursive_call_info and remove it from kwargs
662671
if debug:
@@ -687,6 +696,13 @@ def query(
687696
model=llm["llm"],
688697
messages=messages,
689698
**completion_kwargs)
699+
if stream:
700+
# TODO: for now we take a shortcut here and simply return the original response
701+
# as "response".
702+
ret["response"] = response
703+
ret["ok"] = True
704+
ret["error"] = ""
705+
return ret
690706
elapsed = time.time() - start
691707
logger.debug(f"Full Response: {response}")
692708
llm["_elapsed_time"] += elapsed
@@ -743,10 +759,14 @@ def query(
743759
if debug:
744760
print(f"DEBUG: checking for tool_calls: {response_message}, have tools: {tools is not None}")
745761
if tools is not None:
762+
# TODO: if streaming is enabled we need to gather the complete response before
763+
# we can process the tool calls
746764
if hasattr(response_message, "tool_calls") and response_message.tool_calls is not None:
747765
tool_calls = response_message.tool_calls
748766
else:
749767
tool_calls = []
768+
if stream:
769+
raise ValueError("Error: streaming is not supported for tool calls yet")
750770
if debug:
751771
print(f"DEBUG: got {len(tool_calls)} tool calls:")
752772
for tool_call in tool_calls:

llms_wrapper/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
import importlib.metadata
2-
__version__ = "0.2.0"
2+
__version__ = "0.3.0"
33

notebooks/test-streaming.ipynb

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "744579c6-ec0c-4c73-bb38-5c99a566056f",
6+
"metadata": {},
7+
"source": [
8+
"# test-streaming.ipynb\n",
9+
"\n",
10+
"Test the API implementation of streaming"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 1,
16+
"id": "a6da6e33-e3dd-45d3-abad-ad48a617b1db",
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
"import os, sys\n",
21+
"from typing import Optional, List, Dict\n",
22+
"sys.path.append(os.path.join(\"..\"))\n",
23+
"from llms_wrapper.llms import LLMS, toolnames2funcs, get_func_by_name\n",
24+
"from llms_wrapper.config import update_llm_config"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 2,
30+
"id": "7401b8b9-af81-4a1e-9bfa-bef00ec3ea68",
31+
"metadata": {},
32+
"outputs": [
33+
{
34+
"data": {
35+
"text/plain": [
36+
"['openai/gpt-4o',\n",
37+
" 'openai/gpt-4o-mini',\n",
38+
" 'gemini/gemini-2.0-flash-exp',\n",
39+
" 'gemini/gemini-1.5-flash',\n",
40+
" 'gemini/gemini-1.5-pro',\n",
41+
" 'anthropic/claude-3-5-sonnet-20240620',\n",
42+
" 'anthropic/claude-3-opus-20240229',\n",
43+
" 'mistral/mistral-large-latest',\n",
44+
" 'xai/grok-beta',\n",
45+
" 'groq/llama3-70b-8192',\n",
46+
" 'groq/llama-3.3-70b-versatile',\n",
47+
" 'deepseek/deepseek-chat']"
48+
]
49+
},
50+
"execution_count": 2,
51+
"metadata": {},
52+
"output_type": "execute_result"
53+
}
54+
],
55+
"source": [
56+
"config = dict(\n",
57+
" llms=[\n",
58+
" # OpenAI\n",
59+
" # https://platform.openai.com/docs/models\n",
60+
" dict(llm=\"openai/gpt-4o\"),\n",
61+
" dict(llm=\"openai/gpt-4o-mini\"),\n",
62+
" # dict(llm=\"openai/o1\"), # restricted\n",
63+
" # dict(llm=\"openai/o1-mini\"), # restricted\n",
64+
" # Google Gemini\n",
65+
" # https://ai.google.dev/gemini-api/docs/models/gemini\n",
66+
" dict(llm=\"gemini/gemini-2.0-flash-exp\"),\n",
67+
" dict(llm=\"gemini/gemini-1.5-flash\"),\n",
68+
" dict(llm=\"gemini/gemini-1.5-pro\"),\n",
69+
" # Anthropic\n",
70+
" # https://docs.anthropic.com/en/docs/about-claude/models\n",
71+
" dict(llm=\"anthropic/claude-3-5-sonnet-20240620\"),\n",
72+
" dict(llm=\"anthropic/claude-3-opus-20240229\"),\n",
73+
" # Mistral\n",
74+
" # https://docs.mistral.ai/getting-started/models/models_overview/\n",
75+
" dict(llm=\"mistral/mistral-large-latest\"),\n",
76+
" # XAI\n",
77+
" # dict(llm=\"xai/grok-2\"), # not mapped by litellm yet?\n",
78+
" dict(llm=\"xai/grok-beta\"),\n",
79+
" # Groq\n",
80+
" # https://console.groq.com/docs/models\n",
81+
" dict(llm=\"groq/llama3-70b-8192\"),\n",
82+
" dict(llm=\"groq/llama-3.3-70b-versatile\"),\n",
83+
" # Deepseek\n",
84+
" # https://api-docs.deepseek.com/quick_start/pricing\n",
85+
" dict(llm=\"deepseek/deepseek-chat\"),\n",
86+
" ],\n",
87+
" providers = dict(\n",
88+
" openai = dict(api_key_env=\"MY_OPENAI_API_KEY\"),\n",
89+
" gemini = dict(api_key_env=\"MY_GEMINI_API_KEY\"),\n",
90+
" anthropic = dict(api_key_env=\"MY_ANTHROPIC_API_KEY\"),\n",
91+
" mistral = dict(api_key_env=\"MY_MISTRAL_API_KEY\"),\n",
92+
" xai = dict(api_key_env=\"MY_XAI_API_KEY\"), \n",
93+
" groq = dict(api_key_env=\"MY_GROQ_API_KEY\"),\n",
94+
" deepseek = dict(api_key_env=\"MY_DEEPSEEK_API_KEY\"),\n",
95+
" )\n",
96+
")\n",
97+
"config = update_llm_config(config)\n",
98+
"llms = LLMS(config)\n",
99+
"llms.list_aliases()"
100+
]
101+
},
102+
{
103+
"cell_type": "markdown",
104+
"id": "09dfe4bc",
105+
"metadata": {},
106+
"source": [
107+
"## Test streaming\n",
108+
"\n"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 14,
114+
"id": "a89258b6",
115+
"metadata": {},
116+
"outputs": [
117+
{
118+
"data": {
119+
"text/plain": [
120+
"[{'content': 'What is a monoid? Give me a simple example. Provide your answer as plain text, do not use Markdown formatting.',\n",
121+
" 'role': 'user'}]"
122+
]
123+
},
124+
"execution_count": 14,
125+
"metadata": {},
126+
"output_type": "execute_result"
127+
}
128+
],
129+
"source": [
130+
"msgs = LLMS.make_messages(\"What is a monoid? Give me a simple example. Provide your answer as plain text, do not use Markdown formatting.\")\n",
131+
"msgs"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": 15,
137+
"id": "491b0ddb",
138+
"metadata": {},
139+
"outputs": [
140+
{
141+
"name": "stdout",
142+
"output_type": "stream",
143+
"text": [
144+
"A monoid is an algebraic structure with a single associative binary operation and an identity element. Specifically, a set M is a monoid if it is equipped with a binary operation (let's call it *) that satisfies the following properties:\n",
145+
"\n",
146+
"1. **Associativity**: For all elements a, b, and c in M, the equation (a * b) * c = a * (b * c) holds.\n",
147+
"2. **Identity Element**: There exists an element e in M such that for every element a in M, the equation e * a = a * e = a holds.\n",
148+
"\n",
149+
"A simple example of a monoid is the set of natural numbers (including zero) with the operation of addition. \n",
150+
"\n",
151+
"- The set is {0, 1, 2, 3, ...}.\n",
152+
"- The binary operation is addition (+).\n",
153+
"- The identity element is 0 because adding 0 to any natural number does not change the number (0 + a = a + 0 = a).\n",
154+
"- Addition is associative because for any natural numbers a, b, and c, the equation (a + b) + c = a + (b + c) is always true."
155+
]
156+
}
157+
],
158+
"source": [
159+
"ret = llms.query(\"openai/gpt-4o\", msgs, temperature=0.5, max_tokens=1000, stream=True)\n",
160+
"if ret[\"ok\"]:\n",
161+
" for chunk in ret[\"response\"]:\n",
162+
" choice0 = chunk.choices[0]\n",
163+
" if choice0.finish_reason == \"stop\":\n",
164+
" break \n",
165+
" content = choice0.delta.content \n",
166+
" print(content, end=\"\", flush=True)\n",
167+
"else:\n",
168+
" print(\"Error:\", ret[\"error\"])"
169+
]
170+
},
171+
{
172+
"cell_type": "code",
173+
"execution_count": null,
174+
"id": "0eeea159",
175+
"metadata": {},
176+
"outputs": [],
177+
"source": []
178+
}
179+
],
180+
"metadata": {
181+
"kernelspec": {
182+
"display_name": "llms_wrapper",
183+
"language": "python",
184+
"name": "python3"
185+
},
186+
"language_info": {
187+
"codemirror_mode": {
188+
"name": "ipython",
189+
"version": 3
190+
},
191+
"file_extension": ".py",
192+
"mimetype": "text/x-python",
193+
"name": "python",
194+
"nbconvert_exporter": "python",
195+
"pygments_lexer": "ipython3",
196+
"version": "3.11.11"
197+
}
198+
},
199+
"nbformat": 4,
200+
"nbformat_minor": 5
201+
}

0 commit comments

Comments
 (0)