agno-agi · dirkbrnd · Jan 17, 2025 · Jan 14, 2025 · Jan 14, 2025 · Jan 16, 2025
diff --git a/cookbook/agent_concepts/agent_multimodal/audio_input_output.py b/cookbook/agent_concepts/agent_multimodal/audio_input_output.py
@@ -1,6 +1,7 @@
 import base64
 import requests
 from agno.agent import Agent
+from agno.media import AudioInput
 from agno.models.openai import OpenAIChat
 from agno.utils.audio import write_audio_to_file
 
@@ -9,7 +10,6 @@
 response = requests.get(url)
 response.raise_for_status()
 wav_data = response.content
-encoded_string = base64.b64encode(wav_data).decode("utf-8")
 
 agent = Agent(
     model=OpenAIChat(
@@ -20,8 +20,8 @@
 
 agent.run(
     "What's in these recording?",
-    audio={"data": encoded_string, "format": "wav"},
+    audio=[AudioInput(content=wav_data, format="wav")],
 )
 
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
+if agent.run_response.response_audio is not None:  # only save when the run returned audio
+    write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/result.wav")
diff --git a/cookbook/agent_concepts/agent_multimodal/audio_multi_turn.py b/cookbook/agent_concepts/agent_multimodal/audio_multi_turn.py
@@ -11,9 +11,9 @@
 )
 
 agent.run("Is a golden retriever a good family dog?")
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav")
+if agent.run_response.response_audio is not None:
+    write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/answer_1.wav")
 
 agent.run("Why do you say they are loyal?")
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav")
+if agent.run_response.response_audio is not None:
+    write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/answer_2.wav")
diff --git a/cookbook/agent_concepts/agent_multimodal/generate_image_with_intermediate_steps.py b/cookbook/agent_concepts/agent_multimodal/generate_image_with_intermediate_steps.py
@@ -3,6 +3,7 @@
 from agno.agent import Agent, RunResponse
 from agno.models.openai import OpenAIChat
 from agno.tools.dalle import DalleTools
+from agno.utils.common import dataclass_to_dict
 
 image_agent = Agent(
     model=OpenAIChat(id="gpt-4o"),
@@ -22,5 +23,5 @@
     stream_intermediate_steps=True,
 )
 for chunk in run_stream:
-    pprint(chunk.model_dump(exclude={"messages"}))
+    pprint(dataclass_to_dict(chunk, exclude={"messages"}))
     print("---" * 20)
diff --git a/cookbook/agent_concepts/agent_multimodal/image_to_audio.py b/cookbook/agent_concepts/agent_multimodal/image_to_audio.py
@@ -3,15 +3,18 @@
 from rich.text import Text
 
 from agno.agent import Agent, RunResponse
+from agno.media import ImageInput
 from agno.models.openai import OpenAIChat
 from agno.utils.audio import write_audio_to_file
 
 cwd = Path(__file__).parent.resolve()
 
 image_agent = Agent(model=OpenAIChat(id="gpt-4o"))
+
+image_path = cwd.joinpath("multimodal-agents.jpg")  # reuse the script directory resolved above
 image_story: RunResponse = image_agent.run(
     "Write a 3 sentence fiction story about the image",
-    images=[str(cwd.joinpath("multimodal-agents.jpg"))],
+    images=[ImageInput(filepath=image_path)],
 )
 formatted_text = Text.from_markup(f":sparkles: [bold magenta]Story:[/bold magenta] {image_story.content} :sparkles:")
 print(formatted_text)
@@ -23,5 +26,5 @@
 )
 
 audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}")
-if audio_story.response_audio is not None and "data" in audio_story.response_audio:
-    write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav")
+if audio_story.response_audio is not None:
+    write_audio_to_file(audio=audio_story.response_audio.content, filename="tmp/multimodal-agents.wav")
diff --git a/cookbook/agent_concepts/agent_multimodal/image_to_text.py b/cookbook/agent_concepts/agent_multimodal/image_to_text.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 from agno.agent import Agent
+from agno.media import ImageInput
 from agno.models.openai import OpenAIChat
 
 agent = Agent(
@@ -11,5 +12,5 @@
 image_path = Path(__file__).parent.joinpath("multimodal-agents.jpg")
 agent.print_response(
     "Write a 3 sentence fiction story about the image",
-    images=[str(image_path)],
+    images=[ImageInput(filepath=image_path)],
 )
diff --git a/cookbook/agent_concepts/agent_multimodal/video_to_shorts.py b/cookbook/agent_concepts/agent_multimodal/video_to_shorts.py
@@ -11,6 +11,7 @@
 from google.generativeai import upload_file, get_file
 
 from agno.agent import Agent
+from agno.media import VideoInput
 from agno.models.google import Gemini
 from agno.utils.log import logger
 
@@ -73,7 +74,7 @@
 """
 
 # 4. Generate Video Analysis
-response = agent.run(query, videos=[video_file])
+response = agent.run(query, videos=[VideoInput(content=video_file)])
 
 # 5. Create output directory
 output_dir = Path(output_dir)

diff --git a/cookbook/agent_concepts/tools/dalle.py b/cookbook/agent_concepts/tools/dalle.py
@@ -2,7 +2,7 @@
 
 from pathlib import Path
 
-from agno.utils.images import download_image
+from agno.utils.media import download_image
 from agno.agent import Agent
 from agno.tools.dalle import DalleTools
 

diff --git a/cookbook/getting_started/03_image_agent.py b/cookbook/getting_started/03_image_agent.py
@@ -2,6 +2,7 @@
 Run `pip install duckduckgo-search` to install dependencies."""
 
 from agno.agent import Agent
+from agno.media import ImageInput
 from agno.models.openai import OpenAIChat
 from agno.tools.duckduckgo import DuckDuckGoTools
 
@@ -29,7 +30,7 @@
 agent.print_response(
     "Tell me about this image and give me the latest news about its city.",
     images=[
-        "https://upload.wikimedia.org/wikipedia/commons/a/ab/Empire_State_Building_From_Rooftop_2019-10-05_19-11.jpg",
+        ImageInput(url="https://upload.wikimedia.org/wikipedia/commons/a/ab/Empire_State_Building_From_Rooftop_2019-10-05_19-11.jpg")
     ],
     stream=True,
 )
diff --git a/cookbook/models/google/audio_agent.py b/cookbook/models/google/audio_agent.py
diff --git a/cookbook/models/google/audio_input.py b/cookbook/models/google/audio_input.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+
+import requests
+
+from agno.agent import Agent
+from agno.media import AudioInput
+from agno.models.google import Gemini
+
+agent = Agent(
+    model=Gemini(id="gemini-2.0-flash-exp"),
+    markdown=True,
+)
+
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+
+# Download the audio file from the URL as bytes
+response = requests.get(url)
+# Fail fast on an HTTP error so an error page is never sent to the model as audio
+response.raise_for_status()
+audio_content = response.content
+
+agent.print_response(
+    "Tell me about this audio",
+    audio=[AudioInput(content=audio_content)],
+    stream=True,
+)
diff --git a/.../models/google/audio_agent_file_upload.py → .../models/google/audio_input_file_upload.py b/.../models/google/audio_agent_file_upload.py → .../models/google/audio_input_file_upload.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 from agno.agent import Agent
+from agno.media import AudioInput
 from agno.models.google import Gemini
 from google.generativeai import upload_file
 
@@ -10,12 +11,12 @@
 )
 
 # Please download a sample audio file to test this Agent and upload using:
-audio_path = Path(__file__).parent.joinpath("sample_audio.mp3")
+audio_path = Path(__file__).parent.joinpath("sample.mp3")
 audio_file = upload_file(audio_path)
 print(f"Uploaded audio: {audio_file}")
 
 agent.print_response(
     "Tell me about this audio",
-    audio=audio_file,
+    audio=[AudioInput(content=audio_file)],
     stream=True,
 )
diff --git a/cookbook/models/google/image_agent.py → cookbook/models/google/image_input.py b/cookbook/models/google/image_agent.py → cookbook/models/google/image_input.py
diff --git a/.../models/google/image_agent_file_upload.py → .../models/google/image_input_file_upload.py b/.../models/google/image_agent_file_upload.py → .../models/google/image_input_file_upload.py
diff --git a/cookbook/models/google/video_agent.py b/cookbook/models/google/video_agent.py
@@ -1,9 +1,8 @@
-import time
 from pathlib import Path
 
 from agno.agent import Agent
+from agno.media import VideoInput
 from agno.models.google import Gemini
-from google.generativeai import upload_file, get_file
 
 agent = Agent(
     model=Gemini(id="gemini-2.0-flash-exp"),
@@ -13,12 +12,6 @@
 # Please download "GreatRedSpot.mp4" using
 # wget https://storage.googleapis.com/generativeai-downloads/images/GreatRedSpot.mp4
 video_path = Path(__file__).parent.joinpath("GreatRedSpot.mp4")
-video_file = upload_file(video_path)
-# Check whether the file is ready to be used.
-while video_file.state.name == "PROCESSING":
-    time.sleep(2)
-    video_file = get_file(video_file.name)
 
-print(f"Uploaded video: {video_file}")
-
-agent.print_response("Tell me about this video", videos=[video_file], stream=True)
+agent.print_response("Tell me about this video",
+                     videos=[VideoInput(filepath=video_path)])
diff --git a/cookbook/models/google/video_agent_file_upload.py b/cookbook/models/google/video_agent_file_upload.py
@@ -0,0 +1,30 @@
+import time
+from pathlib import Path
+
+from agno.agent import Agent
+from agno.media import VideoInput
+from agno.models.google import Gemini
+from google.generativeai import upload_file, get_file
+
+agent = Agent(
+    model=Gemini(id="gemini-2.0-flash-exp"),
+    markdown=True,
+)
+
+# Please download "GreatRedSpot.mp4" using
+# wget https://storage.googleapis.com/generativeai-downloads/images/GreatRedSpot.mp4
+video_path = Path(__file__).parent.joinpath("GreatRedSpot.mp4")
+video_file = upload_file(video_path)
+# Check whether the file is ready to be used.
+while video_file.state.name == "PROCESSING":
+    print("Checking:", video_file.name)
+    time.sleep(2)
+    video_file = get_file(video_file.name)
+
+# Stop here if server-side processing failed; the file cannot be used as input.
+if video_file.state.name == "FAILED":
+    raise ValueError(f"Video processing failed: {video_file.name}")
+
+print(f"Uploaded video: {video_file}")
+
+agent.print_response("Tell me about this video", videos=[VideoInput(content=video_file)], stream=True)
diff --git a/cookbook/models/openai/audio_input_agent.py b/cookbook/models/openai/audio_input_agent.py
@@ -1,18 +1,18 @@
 import base64
 import requests
 from agno.agent import Agent, RunResponse  # noqa
+from agno.media import AudioInput
 from agno.models.openai import OpenAIChat
 
 # Fetch the audio file and convert it to a base64 encoded string
 url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
 response = requests.get(url)
 response.raise_for_status()
 wav_data = response.content
-encoded_string = base64.b64encode(wav_data).decode("utf-8")
 
 # Provide the agent with the audio file and get result as text
 agent = Agent(
     model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
     markdown=True,
 )
-agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
+agent.print_response("What is in this audio?", audio=[AudioInput(content=wav_data, format="wav")])
diff --git a/cookbook/models/openai/audio_input_output_output.py b/cookbook/models/openai/audio_input_output_output.py
@@ -0,0 +1,27 @@
+import base64
+import requests
+from agno.agent import Agent
+from agno.media import AudioInput
+from agno.models.openai import OpenAIChat
+from agno.utils.audio import write_audio_to_file
+
+# Fetch the audio file as raw bytes; they are passed to AudioInput below without base64 encoding here
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+
+agent = Agent(
+    model=OpenAIChat(
+        id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}
+    ),
+    markdown=True,
+)
+
+agent.run(
+    "What's in these recording?",
+    audio=[AudioInput(content=wav_data, format="wav")]
+)
+
+if agent.run_response.response_audio is not None:  # only save when the run returned audio
+    write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/result.wav")
diff --git a/cookbook/models/openai/audio_multi_input_agent.py b/cookbook/models/openai/audio_multi_input_agent.py
@@ -0,0 +1,22 @@
+import requests
+from agno.agent import Agent, RunResponse  # noqa
+from agno.media import AudioInput
+from agno.models.openai import OpenAIChat
+
+# Fetch the audio file as raw bytes; they are passed to AudioInput below without base64 encoding here
+url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
+response = requests.get(url)
+response.raise_for_status()
+wav_data = response.content
+
+# Create the agent, send the audio with the first question, then ask a follow-up without an audio argument
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"},),
+    markdown=True,
+    add_history_to_messages=True,
+    num_history_responses=3,
+    debug_mode=True
+)
+agent.print_response("What is in this audio?", audio=[AudioInput(content=wav_data, format="wav")])
+
+agent.print_response("What else can you tell me about it?")
diff --git a/cookbook/models/openai/audio_output_agent.py b/cookbook/models/openai/audio_output_agent.py
@@ -1,15 +1,7 @@
-import base64
-import requests
 from agno.agent import Agent, RunResponse  # noqa
 from agno.models.openai import OpenAIChat
 from agno.utils.audio import write_audio_to_file
 
-# Fetch the audio file and convert it to a base64 encoded string
-url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
-response = requests.get(url)
-response.raise_for_status()
-wav_data = response.content
-encoded_string = base64.b64encode(wav_data).decode("utf-8")
 
 # Provide the agent with the audio file and audio configuration and get result as text + audio
 agent = Agent(
@@ -18,8 +10,8 @@
     ),
     markdown=True,
 )
-agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
+response: RunResponse = agent.run("Tell me a 5 second scary story")
 
 # Save the response audio to a file
-if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
-    write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
+if response.response_audio is not None:  # guard and write both use the returned RunResponse
+    write_audio_to_file(audio=response.response_audio.content, filename="tmp/scary_story.wav")
diff --git a/cookbook/models/openai/image_agent_bytes.py b/cookbook/models/openai/image_agent_bytes.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+from agno.agent import Agent
+from agno.media import ImageInput
+from agno.models.openai import OpenAIChat
+from agno.tools.duckduckgo import DuckDuckGoTools
+
+agent = Agent(
+    model=OpenAIChat(id="gpt-4o"),
+    tools=[DuckDuckGoTools()],
+    markdown=True,
+)
+
+image_path = Path(__file__).parent.joinpath("sample.jpg")
+
+# Read the image file content as bytes
+image_bytes = image_path.read_bytes()
+
+agent.print_response(
+    "Tell me about this image and give me the latest news about it.",
+    images=[
+        ImageInput(content=image_bytes),
+    ],
+    stream=True,
+)