@@ -100,7 +100,7 @@ def co_v2(
100100 history += f' bot say "{ event ["script" ]} "\n '
101101
102102 elif event ["type" ] == "StartTool" :
103- s = f' await { event [" flow_name" ] } '
103+ s = f" await { event [' flow_name' ] } "
104104 for k , v in event .items ():
105105 if k in [
106106 "type" ,
@@ -275,13 +275,19 @@ def verbose_v1(colang_history: str) -> str:
275275
276276
def to_chat_messages(events: List[dict]) -> List[dict]:
    """Turn an array of events into a sequence of user/assistant messages.

    Only two event types contribute to the result; every other event is
    ignored:

    * ``UserMessage`` -> ``{"role": "user", "content": event["text"]}``
    * ``StartUtteranceBotAction`` ->
      ``{"role": "assistant", "content": event["script"]}``

    The user ``text`` value is forwarded untouched (no ``str()`` conversion),
    so a structured payload keeps its original shape.
    NOTE(review): the multimodal case presumably is a list of
    ``{"type": "text" | "image_url", ...}`` parts -- confirm against callers.

    Args:
        events: Event dictionaries, each carrying a ``"type"`` key.

    Returns:
        The chat messages, in the same order as the originating events.

    Raises:
        KeyError: If an event has no ``"type"`` key, or a matching event
            lacks its ``"text"`` / ``"script"`` payload.
    """
    messages = []
    for event in events:
        if event["type"] == "UserMessage":
            # Pass the content through as-is: converting it to a string here
            # would flatten structured (non-string) content.
            messages.append({"role": "user", "content": event["text"]})
        elif event["type"] == "StartUtteranceBotAction":
            messages.append({"role": "assistant", "content": event["script"]})

    return messages
287293
@@ -296,11 +302,30 @@ def user_assistant_sequence(events: List[dict]) -> str:
296302 User: What can you do?
297303 Assistant: I can help with many things.
298304 ```
305+
306+ For multimodal content, it extracts text content and indicates if there were images.
299307 """
300308 history_items = []
301309 for event in events :
302310 if event ["type" ] == "UserMessage" :
303- history_items .append ("User: " + event ["text" ])
311+ content = event ["text" ]
312+ # Handle multimodal content by extracting text
313+ if isinstance (content , list ):
314+ text_parts = []
315+ has_images = False
316+ for item in content :
317+ if isinstance (item , dict ):
318+ if item .get ("type" ) == "text" :
319+ text_parts .append (item .get ("text" , "" ))
320+ elif item .get ("type" ) == "image_url" :
321+ has_images = True
322+ text_content = " " .join (text_parts )
323+ if has_images :
324+ text_content += " [+ image]"
325+ history_items .append ("User: " + text_content )
326+ else :
327+ # Regular text content
328+ history_items .append ("User: " + str (content ))
304329 elif event ["type" ] == "StartUtteranceBotAction" :
305330 history_items .append ("Assistant: " + event ["script" ])
306331
@@ -375,7 +400,8 @@ def user_assistant_sequence_nemollm(events: List[dict]) -> str:
375400 history_items = []
376401 for event in events :
377402 if event ["type" ] == "UserMessage" :
378- history_items .append ("<extra_id_1>User\n " + event ["text" ])
403+ # Convert text to string regardless of type (handles both text and multimodal)
404+ history_items .append ("<extra_id_1>User\n " + str (event ["text" ]))
379405 elif event ["type" ] == "StartUtteranceBotAction" :
380406 history_items .append ("<extra_id_1>Assistant\n " + event ["script" ])
381407
0 commit comments