Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

subtitle agent with VideoDB Timeline & OpenAI translation #14

Merged
merged 9 commits into from
Oct 28, 2024
250 changes: 250 additions & 0 deletions backend/spielberg/agents/subtitle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
import logging
import textwrap
import json

from spielberg.agents.base import BaseAgent, AgentResponse, AgentStatus
from spielberg.core.session import (
Session,
ContextMessage,
RoleTypes,
VideoContent,
VideoData,
MsgStatus,
)
from spielberg.tools.videodb_tool import VideoDBTool
from spielberg.llm.openai import OpenAI, OpenaiConfig


from videodb.asset import VideoAsset, TextAsset, TextStyle

logger = logging.getLogger(__name__)

SUBTITLE_AGENT_PARAMETERS = {
"type": "object",
"properties": {
"video_id": {
"type": "string",
"description": "The unique identifier of the video to which subtitles will be added.",
},
"collection_id": {
"type": "string",
"description": "The unique identifier of the collection containing the video.",
},
"language": {
"type": "string",
"description": "The language the user wants the subtitles in",
},
"notes": {
"type": "string",
"description": "if user has additional requirements for the style of language",
},
},
"required": ["video_id", "collection_id"],
}


translater_prompt = """

Task Description:
---
You are provided with a transcript of a video in a compact format called compact_list to optimize context size. The transcript is presented as a single string where each word block is formatted as:

word|start|end

word: The word itself.
start: The start time of the word in the video.
end: The end time of the word in the video.

Example Input (compact_list):

[ 'hello|0|10', 'world|11|12', 'how are you|13|15']

Your Task:
---
1.Translate the Text into [TARGET LANGUAGE]:
Translate all the words in the transcript from the source language to [TARGET LANGUAGE].

2.Combine Words into Meaningful Phrases or Sentences:
Group the translated words into logical phrases or sentences that make sense together.
Ensure each group is suitable for subtitle usage—neither too long nor too short.
Adjust Timing for Each Phrase/Block.

3.For each grouped phrase or sentence:
Start Time: Use the start time of the first word in the group.
End Time: Use the end time of the last word in the group.


4.Produce the Final Output:
Provide a list of subtitle blocks in the following format:
[ {"start": 0, "end": 30, "text": "Translated block of text here"}, {"start": 31, "end": 55, "text": "Another translated block of text"}, ...]
Ensure the translated text is coherent and appropriately grouped for subtitles.

Guidelines:
---

Coherence: The translated phrases should be grammatically correct and natural in [TARGET LANGUAGE].
Subtitle Length: Each subtitle block should follow standard subtitle length guidelines (e.g., no more than two lines of text, appropriate reading speed).
Timing Accuracy: Maintain accurate synchronization with the video's audio by correctly calculating start and end times.
Don't add any quotes, or %, that makes escaping the characters difficult.


Example Output:
---
If translating to Spanish, your output might look like:

you should return json of this format
{
subtitles: [ {"start": 0, "end": 30, "text": "Bloque traducido de texto aquí"}, {"start": 31, "end": 55, "text": "Otro bloque de texto traducido"}, ...]
}

Notes:
---
Be mindful of linguistic differences that may affect how words are grouped in [TARGET LANGUAGE].
Ensure that cultural nuances and idiomatic expressions are appropriately translated.

"""


class SubtitleAgent(BaseAgent):
def __init__(self, session: Session, **kwargs):
self.agent_name = "subtitle"
self.description = "An agent designed to add different languages subtitles to a specified video within VideoDB."
self.llm = OpenAI(config=OpenaiConfig(timeout=120))
self.parameters = SUBTITLE_AGENT_PARAMETERS
super().__init__(session=session, **kwargs)

def wrap_text(self, text, video_width, max_width_percent=0.60, avg_char_width=20):
max_width_pixels = video_width * max_width_percent
max_chars_per_line = int(max_width_pixels / avg_char_width)

# Wrap the text based on the calculated max characters per line
wrapped_text = "\n".join(textwrap.wrap(text, max_chars_per_line))
wrapped_text = wrapped_text.replace("'", "")
return wrapped_text

def get_compact_transcript(self, transcript):
compact_list = []
for word_block in transcript:
word = word_block["text"]
if word == "-":
continue
start = word_block["start"]
end = word_block["end"]
compact_word = f"{word}|{start}|{end}"
compact_list.append(compact_word)
return compact_list

def add_subtitles_using_timeline(self, subtitles):
video_width = 1920
timeline = self.videodb_tool.get_and_set_timeline()
video_asset = VideoAsset(asset_id=self.video_id)
timeline.add_inline(video_asset)
for subtitle_chunk in subtitles:
start = round(subtitle_chunk["start"], 2)
end = round(subtitle_chunk["end"], 2)
duration = end - start

wrapped_text = self.wrap_text(subtitle_chunk["text"], video_width)
style = TextStyle(
fontsize=20,
fontcolor="white",
box=True,
boxcolor="black@0.6",
boxborderw="5",
y="main_h-text_h-50",
)
text_asset = TextAsset(
text=wrapped_text,
duration=duration,
style=style,
)
timeline.add_overlay(start=start, asset=text_asset)
stream_url = timeline.generate_stream()
return stream_url

def run(
self,
video_id: str,
collection_id: str,
language: str = "english",
notes: str = "",
*args,
**kwargs,
) -> AgentResponse:
"""
Adds subtitles to the specified video using the provided style configuration.

:param str video_id: The unique identifier of the video to process.
:param str collection_id: The unique identifier of the collection containing the video.
:param str language: A string specifying the language for the subtitles.
:param str notes: A String specifying the style of the language used in subtitles
:param args: Additional positional arguments.
:param kwargs: Additional keyword arguments.
:return: The response indicating the success or failure of the subtitle addition operation.
:rtype: AgentResponse
"""
try:
self.video_id = video_id
self.videodb_tool = VideoDBTool(collection_id=collection_id)

self.output_message.actions.append(
"Retrieving the subtitles in the video's original language"
)
video_content = VideoContent(
agent_name=self.agent_name,
status=MsgStatus.progress,
status_message="Processing...",
)
self.output_message.content.append(video_content)
self.output_message.push_update()

transcript = self.videodb_tool.get_transcript(video_id, text=False)
compact_transcript = self.get_compact_transcript(transcript=transcript)

self.output_message.actions.append(
f"Translating the subtitles to {language}"
)
self.output_message.push_update()
translation_llm_prompt = f"{translater_prompt} Translate to {language}, additional notes : {notes} compact_list: {compact_transcript}"
translation_llm_message = ContextMessage(
content=translation_llm_prompt,
role=RoleTypes.user,
)
llm_response = self.llm.chat_completions(
[translation_llm_message.to_llm_msg()],
response_format={"type": "json_object"},
)
translated_subtitles = json.loads(llm_response.content)

if notes:
self.output_message.actions.append(
f"Refining the language with additional notes: {notes}"
)

self.output_message.actions.append(
"Overlaying the translated subtitles onto the video"
)
self.output_message.push_update()

stream_url = self.add_subtitles_using_timeline(
translated_subtitles["subtitles"]
)
video_content.video = VideoData(stream_url=stream_url)
video_content.status = MsgStatus.success
video_content.status_message = f"Subtitles in {language} have been successfully added to your video. Here is your stream."
self.output_message.publish()

except Exception as e:
logger.exception(f"Error in {self.agent_name} agent: {e}")
video_content.status = MsgStatus.error
video_content.status_message = (
"An error occurred while adding subtitles to the video."
)
self.output_message.publish()
return AgentResponse(status=AgentStatus.ERROR, message=str(e))

return AgentResponse(
status=AgentStatus.SUCCESS,
message="Subtitles added successfully",
data={stream_url: stream_url},
)
3 changes: 3 additions & 0 deletions backend/spielberg/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
from spielberg.agents.profanity_remover import ProfanityRemoverAgent
from spielberg.agents.image_generation import ImageGenerationAgent
from spielberg.agents.stream_video import StreamVideoAgent
from spielberg.agents.subtitle import SubtitleAgent
from spielberg.agents.slack_agent import SlackAgent


from spielberg.core.session import Session, InputMessage, MsgStatus
from spielberg.core.reasoning import ReasoningEngine
from spielberg.db.base import BaseDB
Expand Down Expand Up @@ -44,6 +46,7 @@ def __init__(self, db, **kwargs):
ProfanityRemoverAgent,
ImageGenerationAgent,
StreamVideoAgent,
SubtitleAgent,
SlackAgent,
]

Expand Down
7 changes: 6 additions & 1 deletion backend/spielberg/tools/videodb_tool.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import videodb

from videodb import SearchType
from videodb import SearchType, SubtitleStyle
from videodb.timeline import Timeline
from videodb.asset import VideoAsset, ImageAsset

Expand Down Expand Up @@ -166,3 +166,8 @@ def add_brandkit(self, video_id, intro_video_id, outro_video_id, brand_image_id)
def get_and_set_timeline(self):
self.timeline = Timeline(self.conn)
return self.timeline

def add_subtitle(self, video_id, style: SubtitleStyle = SubtitleStyle()):
video = self.collection.get_video(video_id)
stream_url = video.add_subtitle(style)
return stream_url