Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d335a2f
feat(filters): enhance reasoning trace handling and extraction
Pouyanpi Apr 24, 2025
80f9a5f
feat(config): add guardrail reasoning traces option
Pouyanpi Apr 25, 2025
c59e5dd
feat(taskmanager): enhance task output parsing logic
Pouyanpi Apr 25, 2025
166b209
test: update reasoning trace handling in tests
Pouyanpi Apr 25, 2025
26e3f7d
refactor(llm): standardize access to parsed task output text
Pouyanpi Apr 25, 2025
52edbcf
feat(taskmanager): enhance reasoning trace handling
Pouyanpi Apr 29, 2025
753af44
feat(llmrails): prepend reasoning trace to response content
Pouyanpi Apr 30, 2025
3ba82dc
refactor: rename guardrail_reasoning_traces field
Pouyanpi May 1, 2025
72c1974
test: add tests for LLM Rails reasoning output
Pouyanpi May 1, 2025
45153ab
refactor: rename reasoning trace function
Pouyanpi May 1, 2025
ade7776
add edge cases for token removal logic
Pouyanpi May 2, 2025
8439e0d
add async reasoning trace tests
Pouyanpi May 2, 2025
2838164
fix(llmrails): handle reasoning trace with and without prompt
Pouyanpi May 2, 2025
1e3ba77
fix: set contextvar to None
Pouyanpi May 2, 2025
5719b04
add case when there is no reasoning tag
Pouyanpi May 2, 2025
ccc2482
remove unused code
Pouyanpi May 2, 2025
43ea1d9
enhance reasoning trace extraction test cases
Pouyanpi May 2, 2025
12afd11
enhance trace extraction and stripping
Pouyanpi May 2, 2025
73a32fa
revert style changes
Pouyanpi May 2, 2025
b266c48
rename `apply_to_traces` to `apply_to_reasoning_traces` across test c…
Pouyanpi May 2, 2025
95a4fc6
review: fix docstring
Pouyanpi May 2, 2025
e1ee5e3
review: make start/end tokens optional in helpers
Pouyanpi May 2, 2025
bbe80c2
review: use None instead of empty strings
Pouyanpi May 2, 2025
64ba92a
review: add docstring for parse_task_output
Pouyanpi May 2, 2025
c5c85e8
review: update apply_to_reasoning_traces description
Pouyanpi May 2, 2025
5f63018
review: improve docstring
Pouyanpi May 2, 2025
759eb63
review: update reasoning traces contextvar typehint
Pouyanpi May 2, 2025
0e34af6
resolve conflict after rebase
Pouyanpi May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion nemoguardrails/actions/llm/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,14 @@
generation_options_var,
llm_call_info_var,
raw_llm_request,
reasoning_trace_var,
streaming_handler_var,
)
from nemoguardrails.embeddings.index import EmbeddingsIndex, IndexItem
from nemoguardrails.kb.kb import KnowledgeBase
from nemoguardrails.llm.params import llm_params
from nemoguardrails.llm.prompts import get_prompt
from nemoguardrails.llm.taskmanager import LLMTaskManager
from nemoguardrails.llm.taskmanager import LLMTaskManager, ParsedTaskOutput
from nemoguardrails.llm.types import Task
from nemoguardrails.logging.explain import LLMCallInfo
from nemoguardrails.patch_asyncio import check_sync_call_from_async_loop
Expand Down Expand Up @@ -442,6 +443,7 @@ async def generate_user_intent(
result = self.llm_task_manager.parse_task_output(
Task.GENERATE_USER_INTENT, output=result
)
result = result.text

user_intent = get_first_nonempty_line(result)
if user_intent is None:
Expand Down Expand Up @@ -530,6 +532,11 @@ async def generate_user_intent(
text = self.llm_task_manager.parse_task_output(
Task.GENERAL, output=text
)

text = _process_parsed_output(
text, self._include_reasoning_traces()
)

else:
# Initialize the LLMCallInfo object
llm_call_info_var.set(LLMCallInfo(task=Task.GENERAL.value))
Expand Down Expand Up @@ -565,6 +572,8 @@ async def generate_user_intent(
text = self.llm_task_manager.parse_task_output(
Task.GENERAL, output=result
)

text = _process_parsed_output(text, self._include_reasoning_traces())
text = text.strip()
if text.startswith('"'):
text = text[1:-1]
Expand Down Expand Up @@ -646,6 +655,7 @@ async def generate_next_step(
result = self.llm_task_manager.parse_task_output(
Task.GENERATE_NEXT_STEPS, output=result
)
result = result.text

# If we don't have multi-step generation enabled, we only look at the first line.
if not self.config.enable_multi_step_generation:
Expand Down Expand Up @@ -900,6 +910,10 @@ async def generate_bot_message(
Task.GENERAL, output=result
)

result = _process_parsed_output(
result, self._include_reasoning_traces()
)

log.info(
"--- :: LLM Bot Message Generation passthrough call took %.2f seconds",
time() - t0,
Expand Down Expand Up @@ -963,6 +977,10 @@ async def generate_bot_message(
Task.GENERATE_BOT_MESSAGE, output=result
)

result = _process_parsed_output(
result, self._include_reasoning_traces()
)

# TODO: catch openai.error.InvalidRequestError from exceeding max token length

result = get_multiline_response(result)
Expand Down Expand Up @@ -1055,6 +1073,7 @@ async def generate_value(
result = self.llm_task_manager.parse_task_output(
Task.GENERATE_VALUE, output=result
)
result = result.text

# We only use the first line for now
# TODO: support multi-line values?
Expand Down Expand Up @@ -1266,6 +1285,7 @@ async def generate_intent_steps_message(
result = self.llm_task_manager.parse_task_output(
Task.GENERATE_INTENT_STEPS_MESSAGE, output=result
)
result = result.text

# TODO: Implement logic for generating more complex Colang next steps (multi-step),
# not just a single bot intent.
Expand Down Expand Up @@ -1348,6 +1368,7 @@ async def generate_intent_steps_message(
result = self.llm_task_manager.parse_task_output(
Task.GENERAL, output=result
)
result = _process_parsed_output(result, self._include_reasoning_traces())
text = result.strip()
if text.startswith('"'):
text = text[1:-1]
Expand All @@ -1360,6 +1381,10 @@ async def generate_intent_steps_message(
events=[new_event_dict("BotMessage", text=text)],
)

def _include_reasoning_traces(self) -> bool:
    """Return True when reasoning traces should be kept in the final LLM output.

    Delegates to the module-level helper that reads the rails configuration.
    """
    include_traces = _get_apply_to_reasoning_traces(self.config)
    return include_traces


def clean_utterance_content(utterance: str) -> str:
"""
Expand All @@ -1377,3 +1402,27 @@ def clean_utterance_content(utterance: str) -> str:
# It should be translated to an actual \n character.
utterance = utterance.replace("\\n", "\n")
return utterance


def _record_reasoning_trace(trace: str) -> None:
    """Save *trace* into the reasoning-trace context variable.

    The stored value can be retrieved later (e.g. by
    ``get_and_clear_reasoning_trace_contextvar``) after the LLM call completes.
    """
    reasoning_trace_var.set(trace)


def _assemble_response(text: str, trace: Optional[str], include_reasoning: bool) -> str:
"""Combine trace and text if requested, otherwise just return text."""
return (trace + text) if (trace and include_reasoning) else text


def _process_parsed_output(
    output: ParsedTaskOutput, include_reasoning_trace: bool
) -> str:
    """Store any reasoning trace in context, then assemble the response string.

    Args:
        output: The parsed task output carrying ``text`` and ``reasoning_trace``.
        include_reasoning_trace: Whether the trace is prepended to the text.

    Returns:
        The final response text, with the trace prepended when requested.
    """
    trace = output.reasoning_trace
    if trace:
        _record_reasoning_trace(trace)
    return _assemble_response(output.text, trace, include_reasoning_trace)


def _get_apply_to_reasoning_traces(config: RailsConfig) -> bool:
    """Read the output-rails flag controlling reasoning-trace inclusion.

    Args:
        config: The rails configuration to inspect.

    Returns:
        The value of ``rails.output.apply_to_reasoning_traces``.
    """
    output_rails = config.rails.output
    return output_rails.apply_to_reasoning_traces
16 changes: 14 additions & 2 deletions nemoguardrails/actions/llm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from nemoguardrails.colang.v2_x.lang.colang_ast import Flow
from nemoguardrails.colang.v2_x.runtime.flows import InternalEvent, InternalEvents
from nemoguardrails.context import llm_call_info_var
from nemoguardrails.context import llm_call_info_var, reasoning_trace_var
from nemoguardrails.logging.callbacks import logging_callbacks
from nemoguardrails.logging.explain import LLMCallInfo

Expand Down Expand Up @@ -192,7 +192,7 @@ def get_colang_history(
and event["action_name"] == "retrieve_relevant_chunks"
):
continue
history += f'execute {event["action_name"]}\n'
history += f"execute {event['action_name']}\n"
elif event["type"] == "InternalSystemActionFinished" and not event.get(
"is_system_action"
):
Expand Down Expand Up @@ -577,3 +577,15 @@ def escape_flow_name(name: str) -> str:
# removes non-word chars and leading digits in a word
result = re.sub(r"\b\d+|[^\w\s]", "", result)
return result


def get_and_clear_reasoning_trace_contextvar() -> Optional[str]:
    """Pop the stored reasoning trace from the context variable.

    A falsy value (None or an empty string) is treated as "no trace" and the
    context variable is left untouched in that case.

    Returns:
        Optional[str]: The reasoning trace if one exists, None otherwise.
    """
    trace = reasoning_trace_var.get()
    if not trace:
        return None
    reasoning_trace_var.set(None)
    return trace
18 changes: 17 additions & 1 deletion nemoguardrails/actions/v2_x/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ async def _collect_user_intent_and_examples(

# We add these in reverse order so the most relevant is towards the end.
for result in reversed(results):
examples += f"user action: user said \"{result.text}\"\nuser intent: {result.meta['intent']}\n\n"
examples += f'user action: user said "{result.text}"\nuser intent: {result.meta["intent"]}\n\n'
if result.meta["intent"] not in potential_user_intents:
potential_user_intents.append(result.meta["intent"])

Expand Down Expand Up @@ -302,6 +302,8 @@ async def generate_user_intent(
Task.GENERATE_USER_INTENT_FROM_USER_ACTION, output=result
)

result = result.text

user_intent = get_first_nonempty_line(result)
# GPT-4o often adds 'user intent: ' in front
if user_intent and ":" in user_intent:
Expand Down Expand Up @@ -378,6 +380,8 @@ async def generate_user_intent_and_bot_action(
Task.GENERATE_USER_INTENT_AND_BOT_ACTION_FROM_USER_ACTION, output=result
)

result = result.text

user_intent = get_first_nonempty_line(result)

if user_intent and ":" in user_intent:
Expand Down Expand Up @@ -458,6 +462,8 @@ async def passthrough_llm_action(

text = self.llm_task_manager.parse_task_output(Task.GENERAL, output=text)

text = result.text

return text

@action(name="CheckValidFlowExistsAction", is_system_action=True)
Expand Down Expand Up @@ -541,6 +547,8 @@ async def generate_flow_from_instructions(
task=Task.GENERATE_FLOW_FROM_INSTRUCTIONS, output=result
)

result = result.text

# TODO: why this is not part of a filter or output_parser?
#
lines = _remove_leading_empty_lines(result).split("\n")
Expand Down Expand Up @@ -613,6 +621,8 @@ async def generate_flow_from_name(
task=Task.GENERATE_FLOW_FROM_NAME, output=result
)

result = result.text

lines = _remove_leading_empty_lines(result).split("\n")

if lines[0].startswith("flow"):
Expand Down Expand Up @@ -680,6 +690,8 @@ async def generate_flow_continuation(
task=Task.GENERATE_FLOW_CONTINUATION, output=result
)

result = result.text

lines = _remove_leading_empty_lines(result).split("\n")

if len(lines) == 0 or (len(lines) == 1 and lines[0] == ""):
Expand Down Expand Up @@ -806,6 +818,8 @@ async def generate_value(
Task.GENERATE_VALUE_FROM_INSTRUCTION, output=result
)

result = result.text

# We only use the first line for now
# TODO: support multi-line values?
value = result.strip().split("\n")[0]
Expand Down Expand Up @@ -913,6 +927,8 @@ async def generate_flow(
Task.GENERATE_FLOW_CONTINUATION_FROM_NLD, output=result
)

result = result.text

result = _remove_leading_empty_lines(result)
lines = result.split("\n")
if "codeblock" in lines[0]:
Expand Down
5 changes: 5 additions & 0 deletions nemoguardrails/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

import contextvars
from typing import Optional

streaming_handler_var = contextvars.ContextVar("streaming_handler", default=None)

Expand All @@ -32,3 +33,7 @@
# The raw LLM request that comes from the user.
# This is used in passthrough mode.
raw_llm_request = contextvars.ContextVar("raw_llm_request", default=None)

# The most recent reasoning trace extracted from an LLM response, if any.
# Set during response parsing and read back (then cleared) when assembling
# the final output; None when no trace has been produced.
reasoning_trace_var: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    "reasoning_trace", default=None
)
3 changes: 3 additions & 0 deletions nemoguardrails/library/content_safety/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ async def content_safety_check_input(
result = await llm_call(llm, check_input_prompt, stop=stop)

result = llm_task_manager.parse_task_output(task, output=result)
result = result.text

try:
is_safe, violated_policies = result
Expand Down Expand Up @@ -162,6 +163,8 @@ async def content_safety_check_output(

result = llm_task_manager.parse_task_output(task, output=result)

result = result.text

try:
is_safe, violated_policies = result
except TypeError:
Expand Down
1 change: 1 addition & 0 deletions nemoguardrails/library/self_check/facts/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ async def self_check_facts(
task, output=response, forced_output_parser="is_content_safe"
)

result = result.text
is_not_safe, _ = result

result = float(not is_not_safe)
Expand Down
1 change: 1 addition & 0 deletions nemoguardrails/library/self_check/input_check/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ async def self_check_input(
task, output=response, forced_output_parser="is_content_safe"
)

result = result.text
is_safe, _ = result

if not is_safe:
Expand Down
1 change: 1 addition & 0 deletions nemoguardrails/library/self_check/output_check/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ async def self_check_output(
task, output=response, forced_output_parser="is_content_safe"
)

result = result.text
is_safe, _ = result

return is_safe
Loading