Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Audio and Video interface updates #1783

Merged
merged 7 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import requests
from agno.agent import Agent
from agno.media import AudioInput
from agno.models.openai import OpenAIChat
from agno.utils.audio import write_audio_to_file

Expand All @@ -9,7 +10,6 @@
response = requests.get(url)
response.raise_for_status()
wav_data = response.content
encoded_string = base64.b64encode(wav_data).decode("utf-8")

agent = Agent(
model=OpenAIChat(
Expand All @@ -20,8 +20,8 @@

agent.run(
"What's in these recording?",
audio={"data": encoded_string, "format": "wav"},
audio=[AudioInput(content=wav_data, format="wav")],
)

if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
if agent.run_response.response_audio is not None :
write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/result.wav")
8 changes: 4 additions & 4 deletions cookbook/agent_concepts/agent_multimodal/audio_multi_turn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
)

agent.run("Is a golden retriever a good family dog?")
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_1.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/answer_1.wav")

agent.run("Why do you say they are loyal?")
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/answer_2.wav")
if agent.run_response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/answer_2.wav")
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from agno.agent import Agent, RunResponse
from agno.models.openai import OpenAIChat
from agno.tools.dalle import DalleTools
from agno.utils.common import dataclass_to_dict

image_agent = Agent(
model=OpenAIChat(id="gpt-4o"),
Expand All @@ -22,5 +23,5 @@
stream_intermediate_steps=True,
)
for chunk in run_stream:
pprint(chunk.model_dump(exclude={"messages"}))
pprint(dataclass_to_dict(chunk, exclude={"messages"}))
print("---" * 20)
9 changes: 6 additions & 3 deletions cookbook/agent_concepts/agent_multimodal/image_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@
from rich.text import Text

from agno.agent import Agent, RunResponse
from agno.media import ImageInput
from agno.models.openai import OpenAIChat
from agno.utils.audio import write_audio_to_file

cwd = Path(__file__).parent.resolve()

image_agent = Agent(model=OpenAIChat(id="gpt-4o"))

image_path = Path(__file__).parent.joinpath("multimodal-agents.jpg")
image_story: RunResponse = image_agent.run(
"Write a 3 sentence fiction story about the image",
images=[str(cwd.joinpath("multimodal-agents.jpg"))],
images=[ImageInput(filepath=image_path)],
)
formatted_text = Text.from_markup(f":sparkles: [bold magenta]Story:[/bold magenta] {image_story.content} :sparkles:")
print(formatted_text)
Expand All @@ -23,5 +26,5 @@
)

audio_story: RunResponse = audio_agent.run(f"Narrate the story with flair: {image_story.content}")
if audio_story.response_audio is not None and "data" in audio_story.response_audio:
write_audio_to_file(audio=audio_story.response_audio["data"], filename="tmp/multimodal-agents.wav")
if audio_story.response_audio is not None:
write_audio_to_file(audio=audio_story.response_audio.content, filename="tmp/multimodal-agents.wav")
3 changes: 2 additions & 1 deletion cookbook/agent_concepts/agent_multimodal/image_to_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path

from agno.agent import Agent
from agno.media import ImageInput
from agno.models.openai import OpenAIChat

agent = Agent(
Expand All @@ -11,5 +12,5 @@
image_path = Path(__file__).parent.joinpath("multimodal-agents.jpg")
agent.print_response(
"Write a 3 sentence fiction story about the image",
images=[str(image_path)],
images=[ImageInput(filepath=image_path)],
)
3 changes: 2 additions & 1 deletion cookbook/agent_concepts/agent_multimodal/video_to_shorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from google.generativeai import upload_file, get_file

from agno.agent import Agent
from agno.media import VideoInput
from agno.models.google import Gemini
from agno.utils.log import logger

Expand Down Expand Up @@ -73,7 +74,7 @@
"""

# 4. Generate Video Analysis
response = agent.run(query, videos=[video_file])
response = agent.run(query, videos=[VideoInput(content=video_file)])

# 5. Create output directory
output_dir = Path(output_dir)
Expand Down
2 changes: 1 addition & 1 deletion cookbook/agent_concepts/tools/dalle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pathlib import Path

from agno.utils.images import download_image
from agno.utils.media import download_image
from agno.agent import Agent
from agno.tools.dalle import DalleTools

Expand Down
3 changes: 2 additions & 1 deletion cookbook/getting_started/03_image_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Run `pip install duckduckgo-search` to install dependencies."""

from agno.agent import Agent
from agno.media import ImageInput
from agno.models.openai import OpenAIChat
from agno.tools.duckduckgo import DuckDuckGoTools

Expand Down Expand Up @@ -29,7 +30,7 @@
agent.print_response(
"Tell me about this image and give me the latest news about its city.",
images=[
"https://upload.wikimedia.org/wikipedia/commons/a/ab/Empire_State_Building_From_Rooftop_2019-10-05_19-11.jpg",
ImageInput(url="https://upload.wikimedia.org/wikipedia/commons/a/ab/Empire_State_Building_From_Rooftop_2019-10-05_19-11.jpg")
],
stream=True,
)
21 changes: 0 additions & 21 deletions cookbook/models/google/audio_agent.py

This file was deleted.

24 changes: 24 additions & 0 deletions cookbook/models/google/audio_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from pathlib import Path

import requests

from agno.agent import Agent
from agno.media import AudioInput
from agno.models.google import Gemini

# Agent backed by the Gemini "gemini-2.0-flash-exp" model, created with markdown=True.
agent = Agent(
    model=Gemini(id="gemini-2.0-flash-exp"),
    markdown=True,
)

url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"

# Download the audio file from the URL as bytes
response = requests.get(url)
# Stop on an HTTP error status so an error page body is never passed on as audio.
response.raise_for_status()
audio_content = response.content

# Send the downloaded bytes to the agent as a single audio input and stream the reply.
agent.print_response(
    "Tell me about this audio",
    audio=[AudioInput(content=audio_content)],
    stream=True,
)
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path

from agno.agent import Agent
from agno.media import AudioInput
from agno.models.google import Gemini
from google.generativeai import upload_file

Expand All @@ -10,12 +11,12 @@
)

# Please download a sample audio file to test this Agent and upload using:
audio_path = Path(__file__).parent.joinpath("sample_audio.mp3")
audio_path = Path(__file__).parent.joinpath("sample.mp3")
audio_file = upload_file(audio_path)
print(f"Uploaded audio: {audio_file}")

agent.print_response(
"Tell me about this audio",
audio=audio_file,
audio=[AudioInput(content=audio_file)],
stream=True,
)
13 changes: 3 additions & 10 deletions cookbook/models/google/video_agent.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import time
from pathlib import Path

from agno.agent import Agent
from agno.media import VideoInput
from agno.models.google import Gemini
from google.generativeai import upload_file, get_file

agent = Agent(
model=Gemini(id="gemini-2.0-flash-exp"),
Expand All @@ -13,12 +12,6 @@
# Please download "GreatRedSpot.mp4" using
# wget https://storage.googleapis.com/generativeai-downloads/images/GreatRedSpot.mp4
video_path = Path(__file__).parent.joinpath("GreatRedSpot.mp4")
video_file = upload_file(video_path)
# Check whether the file is ready to be used.
while video_file.state.name == "PROCESSING":
time.sleep(2)
video_file = get_file(video_file.name)

print(f"Uploaded video: {video_file}")

agent.print_response("Tell me about this video", videos=[video_file], stream=True)
agent.print_response("Tell me about this video",
videos=[VideoInput(filepath=video_path)])
26 changes: 26 additions & 0 deletions cookbook/models/google/video_agent_file_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import time
from pathlib import Path

from agno.agent import Agent
from agno.media import VideoInput
from agno.models.google import Gemini
from google.generativeai import upload_file, get_file

# Agent backed by the Gemini "gemini-2.0-flash-exp" model, created with markdown=True.
agent = Agent(
    model=Gemini(id="gemini-2.0-flash-exp"),
    markdown=True,
)

# Please download "GreatRedSpot.mp4" using
# wget https://storage.googleapis.com/generativeai-downloads/images/GreatRedSpot.mp4
video_path = Path(__file__).parent.joinpath("GreatRedSpot.mp4")
video_file = upload_file(video_path)

# Check whether the file is ready to be used: re-fetch it every 2 seconds
# for as long as its state is reported as "PROCESSING".
while video_file.state.name == "PROCESSING":
    print("Checking:", video_file.name)
    time.sleep(2)
    video_file = get_file(video_file.name)

# The loop above also exits when processing ended unsuccessfully; do not hand
# a failed upload to the agent.
if video_file.state.name == "FAILED":
    raise ValueError(f"Video processing failed: {video_file.name}")

print(f"Uploaded video: {video_file}")

# Pass the uploaded file object to the agent as a video input and stream the reply.
agent.print_response("Tell me about this video", videos=[VideoInput(content=video_file)], stream=True)
4 changes: 2 additions & 2 deletions cookbook/models/openai/audio_input_agent.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import base64
import requests
from agno.agent import Agent, RunResponse # noqa
from agno.media import AudioInput
from agno.models.openai import OpenAIChat

# Fetch the audio file and convert it to a base64 encoded string
url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
response = requests.get(url)
response.raise_for_status()
wav_data = response.content
encoded_string = base64.b64encode(wav_data).decode("utf-8")

# Provide the agent with the audio file and get result as text
agent = Agent(
model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"]),
markdown=True,
)
agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
agent.print_response("What is in this audio?", audio=[AudioInput(content=wav_data, format="wav")])
27 changes: 27 additions & 0 deletions cookbook/models/openai/audio_input_output_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import base64
import requests
from agno.agent import Agent
from agno.media import AudioInput
from agno.models.openai import OpenAIChat
from agno.utils.audio import write_audio_to_file

# Download the sample WAV recording; its raw bytes are passed to the agent below.
audio_url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
download = requests.get(audio_url)
download.raise_for_status()
wav_bytes = download.content

# Model requested with both "text" and "audio" modalities, voice "alloy", format "wav".
audio_model = OpenAIChat(
    id="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
)
agent = Agent(model=audio_model, markdown=True)

# Run the agent with the downloaded recording attached as a single audio input.
agent.run(
    "What's in these recording?",
    audio=[AudioInput(content=wav_bytes, format="wav")],
)

# If the run produced response audio, write its content to tmp/result.wav.
reply_audio = agent.run_response.response_audio
if reply_audio is not None:
    write_audio_to_file(audio=reply_audio.content, filename="tmp/result.wav")
22 changes: 22 additions & 0 deletions cookbook/models/openai/audio_multi_input_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import requests
from agno.agent import Agent, RunResponse # noqa
from agno.media import AudioInput
from agno.models.openai import OpenAIChat

# Download the sample WAV recording as raw bytes.
audio_url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
download = requests.get(audio_url)
download.raise_for_status()
wav_bytes = download.content

# Agent with chat history enabled (up to 3 history responses) and debug mode on;
# the model is requested with both "text" and "audio" modalities.
agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
    ),
    markdown=True,
    add_history_to_messages=True,
    num_history_responses=3,
    debug_mode=True,
)

# First turn: ask about the downloaded recording, attached as a single audio input.
agent.print_response(
    "What is in this audio?",
    audio=[AudioInput(content=wav_bytes, format="wav")],
)

# Second turn: follow-up question sent without any new audio attached.
agent.print_response("What else can you tell me about it?")
14 changes: 3 additions & 11 deletions cookbook/models/openai/audio_output_agent.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,7 @@
import base64
import requests
from agno.agent import Agent, RunResponse # noqa
from agno.models.openai import OpenAIChat
from agno.utils.audio import write_audio_to_file

# Fetch the audio file and convert it to a base64 encoded string
url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
response = requests.get(url)
response.raise_for_status()
wav_data = response.content
encoded_string = base64.b64encode(wav_data).decode("utf-8")

# Provide the agent with the audio file and audio configuration and get result as text + audio
agent = Agent(
Expand All @@ -18,8 +10,8 @@
),
markdown=True,
)
agent.print_response("What is in this audio?", audio={"data": encoded_string, "format": "wav"})
response: RunResponse = agent.run("Tell me a 5 second scary story")

# Save the response audio to a file
if agent.run_response.response_audio is not None and "data" in agent.run_response.response_audio:
write_audio_to_file(audio=agent.run_response.response_audio["data"], filename="tmp/dog.wav")
if response.response_audio is not None:
write_audio_to_file(audio=agent.run_response.response_audio.content, filename="tmp/scary_story.wav")
26 changes: 26 additions & 0 deletions cookbook/models/openai/image_agent_bytes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from pathlib import Path

from agno.agent import Agent
from agno.media import ImageInput
from agno.models.openai import OpenAIChat
from agno.tools.duckduckgo import DuckDuckGoTools

# Agent backed by OpenAI "gpt-4o" with the DuckDuckGo tools attached, created with markdown=True.
agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    tools=[DuckDuckGoTools()],
    markdown=True,
)

# The sample image is looked up in the same directory as this script.
image_path = Path(__file__).parent / "sample.jpg"

# Read the image file content as bytes
image_bytes = image_path.read_bytes()

# Send the raw image bytes to the agent as a single image input and stream the reply.
agent.print_response(
    "Tell me about this image and give me the latest news about it.",
    images=[ImageInput(content=image_bytes)],
    stream=True,
)
Loading
Loading