
Commit ef97795

fix: enable token usage tracking for streaming LLM calls (#1264)
* fix: enable token usage tracking for streaming LLM calls

  Set stream_usage=True by default in model kwargs to ensure token usage metadata is included during streaming operations. This allows the LoggingCallbackHandler to properly track and report token statistics for streaming LLM calls. Without this parameter, streaming responses don't include usage_metadata, causing token usage tracking to fail during streaming operations and affecting accurate usage reporting and monitoring. Fixes token usage tracking when using streaming with LangChain chat models.

* feat(llmrails): enable stream_usage only for supported engines

* test: add tests for stream_usage and token tracking

* feat: add constant for stream usage supported llm engines

* test: add integration tests for streaming

  Add integration tests to verify token usage tracking with streaming and non-streaming LLMs, including multiple calls and unsupported providers. Update FakeLLM and TestChat to simulate stream_usage and token usage behavior for supported engines.

* always pass stream_usage when streaming

* chore(deps): bump langchain-openai to >=0.1.0
1 parent 0d3ddfc commit ef97795
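For context, a minimal sketch (not part of this commit) of the behavior the fix relies on: with langchain-openai, passing stream_usage=True makes the stream emit a final chunk that carries usage_metadata, which is what the LoggingCallbackHandler needs in order to report token statistics. The model name and prompt below are arbitrary placeholders, and an OpenAI API key is assumed to be configured.

# Minimal sketch, assuming langchain-openai is installed and an API key is set.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", stream_usage=True)

final = None
for chunk in llm.stream("Hello!"):
    # chunks merge via "+"; with stream_usage=True the last chunk carries
    # usage_metadata, so the merged message ends up with token counts
    final = chunk if final is None else final + chunk

print(final.usage_metadata)  # expected shape: {"input_tokens": ..., "output_tokens": ..., "total_tokens": ...}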

File tree

7 files changed: +699, -8 lines changed


nemoguardrails/rails/llm/llmrails.py

Lines changed: 5 additions & 0 deletions
@@ -367,6 +367,11 @@ def _prepare_model_kwargs(self, model_config):
         if api_key:
             kwargs["api_key"] = api_key
 
+        # enable streaming token usage when streaming is enabled
+        # providers that don't support this parameter will simply ignore it
+        if self.config.streaming:
+            kwargs["stream_usage"] = True
+
         return kwargs
 
     def _configure_main_llm_streaming(
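A hedged sketch of the downstream effect: the prepared kwargs are forwarded to the provider class by init_llm_model, whose internals are not part of this diff, so for an OpenAI main model the new branch is roughly equivalent to constructing the chat model as below. The api_key value is a placeholder, and per the commit message, providers that do not recognize stream_usage are expected to ignore it.

# Rough equivalent of what the new branch produces for an OpenAI main model
# when streaming is enabled; an illustration, not the actual init_llm_model code.
from langchain_openai import ChatOpenAI

kwargs = {"api_key": "sk-placeholder", "stream_usage": True}  # as built by _prepare_model_kwargs
llm = ChatOpenAI(model="gpt-4", **kwargs)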

poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ opentelemetry-sdk = { version = ">=1.27.0,<2.0.0", optional = true }
 aiofiles = { version = ">=24.1.0", optional = true }
 
 # openai
-langchain-openai = { version = ">=0.0.5", optional = true }
+langchain-openai = { version = ">=0.1.0", optional = true }
 
 # eval
 tqdm = { version = ">=4.65,<5.0", optional = true }
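A quick way to confirm an existing environment satisfies the raised constraint (a hedged sketch; it assumes the optional langchain-openai dependency is already installed and that the packaging library is available):

# Hedged check: verify the installed langchain-openai meets the new ">=0.1.0" floor.
from importlib.metadata import version
from packaging.version import Version

assert Version(version("langchain-openai")) >= Version("0.1.0"), "upgrade langchain-openai"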

tests/test_callbacks.py

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from uuid import uuid4

import pytest
from langchain.schema import Generation, LLMResult
from langchain_core.messages import AIMessage
from langchain_core.outputs import ChatGeneration

from nemoguardrails.context import explain_info_var, llm_call_info_var, llm_stats_var
from nemoguardrails.logging.callbacks import LoggingCallbackHandler
from nemoguardrails.logging.explain import ExplainInfo, LLMCallInfo
from nemoguardrails.logging.stats import LLMStats


@pytest.mark.asyncio
async def test_token_usage_tracking_with_usage_metadata():
    """Test that token usage is tracked when usage_metadata is available (stream_usage=True scenario)."""

    llm_call_info = LLMCallInfo()
    llm_call_info_var.set(llm_call_info)

    llm_stats = LLMStats()
    llm_stats_var.set(llm_stats)

    explain_info = ExplainInfo()
    explain_info_var.set(explain_info)

    handler = LoggingCallbackHandler()

    # simulate the LLM response with usage metadata (as would happen with stream_usage=True)
    ai_message = AIMessage(
        content="Hello! How can I help you?",
        usage_metadata={"input_tokens": 10, "output_tokens": 6, "total_tokens": 16},
    )

    chat_generation = ChatGeneration(message=ai_message)
    llm_result = LLMResult(generations=[[chat_generation]])

    # call the on_llm_end method
    await handler.on_llm_end(llm_result, run_id=uuid4())

    assert llm_call_info.total_tokens == 16
    assert llm_call_info.prompt_tokens == 10
    assert llm_call_info.completion_tokens == 6

    assert llm_stats.get_stat("total_tokens") == 16
    assert llm_stats.get_stat("total_prompt_tokens") == 10
    assert llm_stats.get_stat("total_completion_tokens") == 6


@pytest.mark.asyncio
async def test_token_usage_tracking_with_llm_output_fallback():
    """Test token usage tracking with legacy llm_output format."""

    llm_call_info = LLMCallInfo()
    llm_call_info_var.set(llm_call_info)

    llm_stats = LLMStats()
    llm_stats_var.set(llm_stats)

    explain_info = ExplainInfo()
    explain_info_var.set(explain_info)

    handler = LoggingCallbackHandler()

    # simulate LLM response with token usage in llm_output (fallback scenario)
    generation = Generation(text="Fallback response")
    llm_result = LLMResult(
        generations=[[generation]],
        llm_output={
            "token_usage": {
                "total_tokens": 20,
                "prompt_tokens": 12,
                "completion_tokens": 8,
            }
        },
    )

    await handler.on_llm_end(llm_result, run_id=uuid4())

    assert llm_call_info.total_tokens == 20
    assert llm_call_info.prompt_tokens == 12
    assert llm_call_info.completion_tokens == 8

    assert llm_stats.get_stat("total_tokens") == 20
    assert llm_stats.get_stat("total_prompt_tokens") == 12
    assert llm_stats.get_stat("total_completion_tokens") == 8


@pytest.mark.asyncio
async def test_no_token_usage_tracking_without_metadata():
    """Test that no token usage is tracked when metadata is not available."""

    llm_call_info = LLMCallInfo()
    llm_call_info_var.set(llm_call_info)

    llm_stats = LLMStats()
    llm_stats_var.set(llm_stats)

    explain_info = ExplainInfo()
    explain_info_var.set(explain_info)

    handler = LoggingCallbackHandler()

    # simulate LLM response without usage metadata (stream_usage=False scenario)
    ai_message = AIMessage(content="Hello! How can I help you?")
    chat_generation = ChatGeneration(message=ai_message)
    llm_result = LLMResult(generations=[[chat_generation]])

    await handler.on_llm_end(llm_result, run_id=uuid4())

    assert llm_call_info.total_tokens is None or llm_call_info.total_tokens == 0
    assert llm_call_info.prompt_tokens is None or llm_call_info.prompt_tokens == 0
    assert (
        llm_call_info.completion_tokens is None or llm_call_info.completion_tokens == 0
    )


@pytest.mark.asyncio
async def test_multiple_generations_token_accumulation():
    """Test that token usage accumulates across multiple generations."""

    llm_call_info = LLMCallInfo()
    llm_call_info_var.set(llm_call_info)

    llm_stats = LLMStats()
    llm_stats_var.set(llm_stats)

    explain_info = ExplainInfo()
    explain_info_var.set(explain_info)

    handler = LoggingCallbackHandler()

    ai_message1 = AIMessage(
        content="First response",
        usage_metadata={"input_tokens": 5, "output_tokens": 3, "total_tokens": 8},
    )

    ai_message2 = AIMessage(
        content="Second response",
        usage_metadata={"input_tokens": 7, "output_tokens": 4, "total_tokens": 11},
    )

    chat_generation1 = ChatGeneration(message=ai_message1)
    chat_generation2 = ChatGeneration(message=ai_message2)
    llm_result = LLMResult(generations=[[chat_generation1, chat_generation2]])

    await handler.on_llm_end(llm_result, run_id=uuid4())

    assert llm_call_info.total_tokens == 19  # 8 + 11
    assert llm_call_info.prompt_tokens == 12  # 5 + 7
    assert llm_call_info.completion_tokens == 7  # 3 + 4

    assert llm_stats.get_stat("total_tokens") == 19
    assert llm_stats.get_stat("total_prompt_tokens") == 12
    assert llm_stats.get_stat("total_completion_tokens") == 7
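These tests exercise two paths in LoggingCallbackHandler: the modern usage_metadata attached to AIMessage and the legacy token_usage dict in llm_output. As an illustration only (the handler's real implementation is not shown in this commit), the aggregation the tests assert could look roughly like this sketch:

# Illustrative aggregation sketch, mirroring what the tests above assert;
# an assumption, not the actual LoggingCallbackHandler code.
from langchain_core.outputs import LLMResult


def summed_usage(result: LLMResult) -> dict:
    totals = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    for generation in (g for gens in result.generations for g in gens):
        # chat generations carry an AIMessage with optional usage_metadata
        message = getattr(generation, "message", None)
        usage = getattr(message, "usage_metadata", None) or {}
        totals["prompt_tokens"] += usage.get("input_tokens", 0)
        totals["completion_tokens"] += usage.get("output_tokens", 0)
        totals["total_tokens"] += usage.get("total_tokens", 0)
    if not any(totals.values()) and result.llm_output:
        # legacy fallback: some providers report usage via llm_output
        totals.update(result.llm_output.get("token_usage", {}))
    return totals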

tests/test_llmrails.py

Lines changed: 87 additions & 0 deletions
@@ -1068,3 +1068,90 @@ def __init__(self):
 
     assert kwargs["api_key"] == "direct-key"
     assert kwargs["temperature"] == 0.3
+
+
+@pytest.mark.asyncio
+@patch("nemoguardrails.rails.llm.llmrails.init_llm_model")
+async def test_stream_usage_enabled_for_streaming_supported_providers(
+    mock_init_llm_model,
+):
+    """Test that stream_usage=True is set when streaming is enabled for supported providers."""
+    config = RailsConfig.from_content(
+        config={
+            "models": [
+                {
+                    "type": "main",
+                    "engine": "openai",
+                    "model": "gpt-4",
+                }
+            ],
+            "streaming": True,
+        }
+    )
+
+    LLMRails(config=config)
+
+    mock_init_llm_model.assert_called_once()
+    call_args = mock_init_llm_model.call_args
+    kwargs = call_args.kwargs.get("kwargs", {})
+
+    assert kwargs.get("stream_usage") is True
+
+
+@pytest.mark.asyncio
+@patch("nemoguardrails.rails.llm.llmrails.init_llm_model")
+async def test_stream_usage_not_set_without_streaming(mock_init_llm_model):
+    """Test that stream_usage is not set when streaming is disabled."""
+    config = RailsConfig.from_content(
+        config={
+            "models": [
+                {
+                    "type": "main",
+                    "engine": "openai",
+                    "model": "gpt-4",
+                }
+            ],
+            "streaming": False,
+        }
+    )
+
+    LLMRails(config=config)
+
+    mock_init_llm_model.assert_called_once()
+    call_args = mock_init_llm_model.call_args
+    kwargs = call_args.kwargs.get("kwargs", {})
+
+    assert "stream_usage" not in kwargs
+
+
+@pytest.mark.asyncio
+@patch("nemoguardrails.rails.llm.llmrails.init_llm_model")
+async def test_stream_usage_enabled_for_all_providers_when_streaming(
+    mock_init_llm_model,
+):
+    """Test that stream_usage is passed to ALL providers when streaming is enabled.
+
+    With the new design, stream_usage=True is passed to ALL providers when
+    streaming is enabled. Providers that don't support it will simply ignore it.
+    """
+    config = RailsConfig.from_content(
+        config={
+            "models": [
+                {
+                    "type": "main",
+                    "engine": "unsupported",
+                    "model": "whatever",
+                }
+            ],
+            "streaming": True,
+        }
+    )
+
+    LLMRails(config=config)
+
+    mock_init_llm_model.assert_called_once()
+    call_args = mock_init_llm_model.call_args
+    kwargs = call_args.kwargs.get("kwargs", {})
+
+    # stream_usage should be set for all providers when streaming is enabled
+    assert kwargs.get("stream_usage") is True
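From the user's side, the end result of the change is that token stats become available for streaming configurations. A hedged end-to-end sketch follows; it assumes an OpenAI API key is configured, and the explain fields mirror the attribute names asserted in tests/test_callbacks.py above rather than anything introduced by this commit.

# End-to-end sketch (an assumption, not part of this commit): with streaming
# enabled in the config, per-call token stats should now be populated.
from nemoguardrails import LLMRails, RailsConfig

config = RailsConfig.from_content(
    config={
        "models": [{"type": "main", "engine": "openai", "model": "gpt-4"}],
        "streaming": True,
    }
)
rails = LLMRails(config)

rails.generate(messages=[{"role": "user", "content": "Hi!"}])
info = rails.explain()
for call in info.llm_calls:
    print(call.total_tokens, call.prompt_tokens, call.completion_tokens)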
