feat: add multimodal io rails support

Pouyanpi · Pouyanpi · commit d772fc982e7a · 2025-03-07T13:27:04.000+01:00
feat: handle list type in event text processing

feat: add support for Together engine

fix(logging): ensure msg.content is a string in callbacks

fix(llm): ensure all responses and key items are strings

feat(prompts): add general task for OpenAI models

fix(prompts): ensure model name is lowercase

fix(llm): improve multimodal content handling

fix(taskmanager): handle multimodal content in message processing

feat: add tests for multimodality

feat: add example configs for vision content safety
diff --git a/examples/configs/content_safety_vision/config.yml b/examples/configs/content_safety_vision/config.yml
@@ -0,0 +1,15 @@
+models:
+  - type: main
+    engine: openai
+    model: gpt-4o-mini
+
+  - type: vision_rails
+    engine: openai
+    model: gpt-4o-mini
+rails:
+  input:
+    flows:
+      - content safety check input $model=vision_rails
+  dialog:
+    single_call:
+      enabled: False
diff --git a/examples/configs/content_safety_vision/prompts.yml b/examples/configs/content_safety_vision/prompts.yml
@@ -0,0 +1,45 @@
+prompts:
+  - task: content_safety_check_input $model=vision_rails
+    messages:
+      - type: system
+        content: |
+          You are an AI assistant that evaluates if a user's message contains unsafe content.
+          Carefully examine any text and images in the user's message.
+
+          {%- if categories is not defined -%}
+              {%- set categories = {
+                  "S1": "Violent Crimes.",
+                  "S2": "Non-Violent Crimes.",
+                  "S3": "Sex Crimes.",
+                  "S4": "Child Exploitation.",
+                  "S5": "Defamation.",
+                  "S6": "Specialized Advice.",
+                  "S7": "Privacy.",
+                  "S8": "Intellectual Property.",
+                  "S10": "Hate.",
+                  "S11": "Self-Harm.",
+                  "S12": "Sexual Content.",
+                  "S13": "Elections."
+              } -%}
+          {%- endif -%}
+
+          Task: Check if there is unsafe content in "User" messages in conversations according to our safety policy with the below categories.
+
+          <BEGIN UNSAFE CONTENT CATEGORIES>
+          {%- for key, desc in categories.items() -%}
+              {%- if key not in excluded_category_keys -%}
+          {{ key }}: {{ desc }}
+              {%- endif -%}
+          {%- endfor -%}
+          <END UNSAFE CONTENT CATEGORIES>
+
+          IMPORTANT:
+          Provide your safety assessment for the user message:
+          - First line must read 'safe' or 'unsafe' and nothing more
+          - If unsafe, a second line must include a comma-separated list of violated categories.
+      - type: user
+        content: "{{ user_input  }}"
+
+    stop: ["<|eot_id|>", "<|eom_id|>"]
+    output_parser: is_content_safe
+    max_tokens: 200
diff --git a/nemoguardrails/actions/llm/generation.py b/nemoguardrails/actions/llm/generation.py
@@ -21,8 +21,6 @@
 import re
 import sys
 import threading
-import uuid
-from ast import literal_eval
 from functools import lru_cache
 from time import time
 from typing import Callable, List, Optional, cast
@@ -374,6 +372,12 @@ async def generate_user_intent(
             # We search for the most relevant similar user utterance
             examples = ""
             potential_user_intents = []
+            if isinstance(event["text"], list):
+                text = " ".join(
+                    [item["text"] for item in event["text"] if item["type"] == "text"]
+                )
+            else:
+                text = event["text"]
 
             if self.user_message_index is not None:
                 threshold = None
@@ -384,7 +388,7 @@ async def generate_user_intent(
                     )
 
                 results = await self.user_message_index.search(
-                    text=event["text"], max_results=5, threshold=threshold
+                    text=text, max_results=5, threshold=threshold
                 )
 
                 # If the option to use only the embeddings is activated, we take the first
@@ -409,7 +413,7 @@ async def generate_user_intent(
                     )
                 else:
                     results = await self.user_message_index.search(
-                        text=event["text"], max_results=5
+                        text=text, max_results=5
                     )
                 # We add these in reverse order so the most relevant is towards the end.
                 for result in reversed(results):
diff --git a/nemoguardrails/llm/filters.py b/nemoguardrails/llm/filters.py
@@ -100,7 +100,7 @@ def co_v2(
                     history += f'  bot say "{event["script"]}"\n'
 
                 elif event["type"] == "StartTool":
-                    s = f'  await {event["flow_name"]}'
+                    s = f"  await {event['flow_name']}"
                     for k, v in event.items():
                         if k in [
                             "type",
@@ -275,13 +275,19 @@ def verbose_v1(colang_history: str) -> str:
 
 
 def to_chat_messages(events: List[dict]) -> str:
-    """Filter that turns an array of events into a sequence of user/assistant messages."""
+    """Filter that turns an array of events into a sequence of user/assistant messages.
+
+    Properly handles multimodal content by preserving the structure when the content
+    is in the format of a Message object with potential image_url content.
+    """
     messages = []
     for event in events:
         if event["type"] == "UserMessage":
-            messages.append({"type": "user", "content": event["text"]})
+            # Preserve the original structure when possible to support multimodal content
+            content = event["text"]
+            messages.append({"role": "user", "content": content})
         elif event["type"] == "StartUtteranceBotAction":
-            messages.append({"type": "assistant", "content": event["script"]})
+            messages.append({"role": "assistant", "content": event["script"]})
 
     return messages
 
@@ -296,11 +302,30 @@ def user_assistant_sequence(events: List[dict]) -> str:
        User: What can you do?
        Assistant: I can help with many things.
        ```
+
+    For multimodal content, it extracts text content and indicates if there were images.
     """
     history_items = []
     for event in events:
         if event["type"] == "UserMessage":
-            history_items.append("User: " + event["text"])
+            content = event["text"]
+            # Handle multimodal content by extracting text
+            if isinstance(content, list):
+                text_parts = []
+                has_images = False
+                for item in content:
+                    if isinstance(item, dict):
+                        if item.get("type") == "text":
+                            text_parts.append(item.get("text", ""))
+                        elif item.get("type") == "image_url":
+                            has_images = True
+                text_content = " ".join(text_parts)
+                if has_images:
+                    text_content += " [+ image]"
+                history_items.append("User: " + text_content)
+            else:
+                # Regular text content
+                history_items.append("User: " + str(content))
         elif event["type"] == "StartUtteranceBotAction":
             history_items.append("Assistant: " + event["script"])
 
@@ -375,7 +400,8 @@ def user_assistant_sequence_nemollm(events: List[dict]) -> str:
     history_items = []
     for event in events:
         if event["type"] == "UserMessage":
-            history_items.append("<extra_id_1>User\n" + event["text"])
+            # Convert text to string regardless of type (handles both text and multimodal)
+            history_items.append("<extra_id_1>User\n" + str(event["text"]))
         elif event["type"] == "StartUtteranceBotAction":
             history_items.append("<extra_id_1>Assistant\n" + event["script"])
 
diff --git a/nemoguardrails/llm/prompts.py b/nemoguardrails/llm/prompts.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 """Prompts for the various steps in the interaction."""
+
 import os
 from typing import List, Union
 
@@ -64,6 +65,7 @@ def _get_prompt(
     matching_prompt = None
     matching_score = 0
 
+    model = model.lower()
     for prompt in prompts:
         if prompt.task != task_name:
             continue
diff --git a/nemoguardrails/llm/prompts/openai-chatgpt.yml b/nemoguardrails/llm/prompts/openai-chatgpt.yml
@@ -1,5 +1,19 @@
 # Prompts for OpenAI ChatGPT.
 prompts:
+  - task: general
+    models:
+      - openai/gpt-3.5-turbo
+      - openai/gpt-4
+    messages:
+      - type: system
+        content: |
+          {{ general_instructions }}{% if relevant_chunks != None and relevant_chunks != '' %}
+          This is some relevant context:
+          ```markdown
+          {{ relevant_chunks }}
+          ```{% endif %}
+      - "{{ history | to_chat_messages }}"
+
   - task: generate_user_intent
     models:
       - openai/gpt-3.5-turbo
@@ -305,4 +319,4 @@ prompts:
     messages:
       - type: system
         content: |-
-            {{ flow_nld }}
+          {{ flow_nld }}
diff --git a/nemoguardrails/llm/providers/providers.py b/nemoguardrails/llm/providers/providers.py
@@ -304,6 +304,15 @@ def get_llm_provider(model_config: Model) -> Type[BaseLLM]:
                 "Could not import langchain_google_vertexai, please install it with "
                 "`pip install langchain-google-vertexai`."
             )
+    elif model_config.engine == "together":
+        try:
+            from langchain_together.chat_models import ChatTogether
+
+            return ChatTogether
+        except ImportError:
+            raise ImportError(
+                "Could not import langchain_together, please install it with "
+            )
 
     else:
         return _providers[model_config.engine]
diff --git a/nemoguardrails/llm/taskmanager.py b/nemoguardrails/llm/taskmanager.py
@@ -207,7 +207,16 @@ def _get_messages_text_length(self, messages: List[dict]) -> int:
         """Return the length of the text in the messages."""
         text = ""
         for message in messages:
-            text += message["content"] + "\n"
+            content = message["content"]
+            # Handle multimodal content (when content is a list)
+            if isinstance(content, list):
+                # Extract text from multimodal content
+                for item in content:
+                    if isinstance(item, dict) and item.get("type") == "text":
+                        text += item.get("text", "") + "\n"
+            else:
+                # Regular string content
+                text += content + "\n"
         return len(text)
 
     def render_task_prompt(
diff --git a/nemoguardrails/logging/callbacks.py b/nemoguardrails/logging/callbacks.py
@@ -113,7 +113,7 @@ async def on_chat_model_start(
                 )
                 + "[/]"
                 + "\n"
-                + msg.content
+                + (msg.content if isinstance(msg.content, str) else "")
                 for msg in messages[0]
             ]
         )
diff --git a/nemoguardrails/rails/llm/llmrails.py b/nemoguardrails/rails/llm/llmrails.py
@@ -758,7 +758,13 @@ async def generate_async(
 
         if exception:
             new_message = {"role": "exception", "content": exception}
+
         else:
+            # Ensure all items in responses are strings
+            responses = [
+                str(response) if not isinstance(response, str) else response
+                for response in responses
+            ]
             new_message = {"role": "assistant", "content": "\n".join(responses)}
         if response_tool_calls:
             new_message["tool_calls"] = response_tool_calls
diff --git a/nemoguardrails/rails/llm/utils.py b/nemoguardrails/rails/llm/utils.py
@@ -23,7 +23,7 @@ def get_history_cache_key(messages: List[dict]) -> str:
         messages: The list of messages.
 
     Returns:
-        A unique string that can be used as a key for the provides sequence of messages.
+        A unique string that can be used as a key for the provided sequence of messages.
     """
     if len(messages) == 0:
         return ""
@@ -32,14 +32,27 @@ def get_history_cache_key(messages: List[dict]) -> str:
 
     for msg in messages:
         if msg["role"] == "user":
-            key_items.append(msg["content"])
+            # Check if content is a string or a list (multimodal content)
+            if isinstance(msg["content"], list):
+                # For multimodal content, join all text parts
+                text_parts = []
+                for item in msg["content"]:
+                    if item.get("type") == "text":
+                        text_parts.append(item.get("text", ""))
+                key_items.append(" ".join(text_parts))
+            else:
+                # Use the content directly without json.dumps
+                key_items.append(msg["content"])
         elif msg["role"] == "assistant":
             key_items.append(msg["content"])
         elif msg["role"] == "context":
             key_items.append(json.dumps(msg["content"]))
         elif msg["role"] == "event":
             key_items.append(json.dumps(msg["event"]))
 
+    # Ensure all items in key_items are strings
+    key_items = [str(item) if not isinstance(item, str) else item for item in key_items]
+
     history_cache_key = ":".join(key_items)
 
     return history_cache_key
diff --git a/tests/test_filters.py b/tests/test_filters.py
diff --git a/tests/test_llm_task_manager_multimodal.py b/tests/test_llm_task_manager_multimodal.py

Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,7 @@ async def on_chat_model_start(`
`113`	`113`	`)`
`114`	`114`	`+ "[/]"`
`115`	`115`	`+ "\n"`
`116`		`- + msg.content`
	`116`	`+ + (msg.content if isinstance(msg.content, str) else "")`
`117`	`117`	`for msg in messages[0]`
`118`	`118`	`]`
`119`	`119`	`)`