import os
import random

import ffmpeg
import stable_whisper
from gtts import gTTS
from html2image import Html2Image
from jinja2 import Template
from openai import OpenAI

# Temporary file paths
TTS_TITLE_PATH = 'tts/ttsTitle.mp3'
TTS_DESCRIPTION_PATH = 'tts/ttsDescription.mp3'
VIDEO_FOLDER_PATH = 'background_videos/minecraft'
SUBTITLES_PATH = 'subtitles.srt'
OUTPUT_VIDEO_PATH = 'final.mp4'
# OpenAI
OPENAI_API_KEY = 'API-KEY'
GPT_MODEL = "gpt-3.5-turbo"
# Intro Card
INTRO_CARD_TEMPLATE_HTML = "intro_card_template/template.html"
INTRO_CARD_TEMPLATE_CSS = "intro_card_template/index.css"
TITLE_IMAGE_PATH = "titleCard.png"
LOGO_PATH = 'intro_card_template/logo.png'
USERNAME = 'USERNAME'
COMMENT_SVG_PATH = 'intro_card_template/comment.svg'
UPVOTE_SVG_PATH = 'intro_card_template/upvote.svg'
VIEWS_SVG_PATH = 'intro_card_template/views.svg'
# ffmpeg
VCODEC = 'libx264'
THREADS = 4
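
# Expected layout relative to this script (inferred from the paths above):
#   tts/                          generated TTS audio
#   background_videos/minecraft/  background gameplay clips
#   intro_card_template/          HTML/CSS template and assets for the title card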

# Generate Video
def generate_video(title_text, description_text):
    # Optionally clean the text up with GPT before synthesis
    # title_text = get_tts_ready_text(title_text)
    # description_text = get_tts_ready_text(description_text)

    # Generate the intro card image
    generate_title_card(INTRO_CARD_TEMPLATE_HTML, title_text, LOGO_PATH, USERNAME, TITLE_IMAGE_PATH)
    # Generate the TTS .mp3 files
    gtts_synthesize_text(title_text, TTS_TITLE_PATH)
    gtts_synthesize_text(description_text, TTS_DESCRIPTION_PATH)
    # Generate the subtitle .srt
    whisper_transcribe(TTS_TITLE_PATH, TTS_DESCRIPTION_PATH, SUBTITLES_PATH)
    video = process_video(TTS_TITLE_PATH, TTS_DESCRIPTION_PATH, TITLE_IMAGE_PATH, SUBTITLES_PATH)
    # Merge title & description audio
    merged_audio = merge_audio(TTS_TITLE_PATH, TTS_DESCRIPTION_PATH)
    # Merge video & audio into the final output
    merge_video_audio(video, merged_audio, OUTPUT_VIDEO_PATH)

def get_tts_ready_text(text):
    instructions = ("Correct grammar mistakes. Don't alter curse words or swearing. "
                    "Replace slashes and dashes with the appropriate word. "
                    "Add punctuation as necessary for smooth speech flow. "
                    "Only respond with the modified (or unmodified if no changes were made) text. "
                    "Do not include any other information: ")
    client = OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": instructions + text,
            }
        ],
        model=GPT_MODEL,
    )
    # Return only the generated text, not the whole API response object
    return response.choices[0].message.content
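
# Hypothetical usage (the actual output depends on the model):
#   get_tts_ready_text("me / my brother went out")
#   might return something like "Me or my brother went out."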

def generate_title_card(html_template_path, title_text, profile_picture_path, username, output_path):
    # Read the Jinja2 template
    with open(html_template_path, "r") as f:
        template = Template(f.read())
    # Render the template with data; html2image copies loaded files into its
    # working directory, so the HTML references assets by bare file name
    title_card_html = template.render(titleText=title_text,
                                      profilePicturePath=os.path.basename(profile_picture_path),
                                      username=username)
    # Make the image assets available to the renderer
    hti = Html2Image(size=(850, 500))
    hti.load_file(profile_picture_path)
    hti.load_file(COMMENT_SVG_PATH)
    hti.load_file(UPVOTE_SVG_PATH)
    hti.load_file(VIEWS_SVG_PATH)
    # Take a "screenshot" of the rendered title card
    hti.screenshot(html_str=title_card_html, css_file=INTRO_CARD_TEMPLATE_CSS, save_as=output_path)

def gtts_synthesize_text(text, output_file):
    # Synthesize speech with Google TTS (US-English accent)
    tts = gTTS(text, lang='en', tld='us')
    tts.save(output_file)

def whisper_transcribe(tts_title_path, tts_description_path, srt_output_path):
    # Transcribe the description audio and write an .srt with at most
    # one second of speech per subtitle line
    model = stable_whisper.load_model('base.en')
    result = model.transcribe(tts_description_path, regroup=True)
    result.split_by_duration(1)
    result.to_srt_vtt(srt_output_path, strip=True, word_level=False, segment_level=True)
    # The title audio plays first in the final video, so shift every
    # subtitle timestamp by the title's duration
    offset = get_audio_duration(tts_title_path)
    offset_srt_time(srt_output_path, srt_output_path, offset)
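
# The resulting file follows standard SRT structure, e.g. (illustrative text):
#   1
#   00:00:03,200 --> 00:00:04,150
#   I never expected this to work
# with all timestamps already shifted by the title audio's duration.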

def offset_srt_time(input_file, output_file, offset_seconds):
    with open(input_file, 'r') as f:
        lines = f.readlines()
    new_lines = []
    for line in lines:
        if '-->' in line:
            # Shift both timestamps on a timing line
            start_time, end_time = line.strip().split(' --> ')
            start_time = add_time_offset(start_time, offset_seconds)
            end_time = add_time_offset(end_time, offset_seconds)
            new_lines.append(f"{start_time} --> {end_time}\n")
        else:
            new_lines.append(line)
    with open(output_file, 'w') as f:
        f.writelines(new_lines)

def add_time_offset(time_str, offset):
    # Parse "HH:MM:SS,mmm" into seconds
    h, m, s_ms = map(float, time_str.replace(',', '.').split(':'))
    total_seconds = h * 3600 + m * 60 + s_ms + offset
    new_h = int(total_seconds // 3600)
    total_seconds %= 3600
    new_m = int(total_seconds // 60)
    new_s_ms = total_seconds % 60
    # Format back to SRT's comma-separated millisecond notation
    return f"{new_h:02d}:{new_m:02d}:{new_s_ms:06.3f}".replace('.', ',')

def process_video(title_audio_path, description_audio_path, title_image_path, subtitles_path, start_time=None, background_video_path=None):
    # Get the duration of both audio clips; their sum is how long the
    # background clip has to run
    title_audio_duration = get_audio_duration(title_audio_path)
    description_audio_duration = get_audio_duration(description_audio_path)
    merged_audio_duration = title_audio_duration + description_audio_duration
    # Choose a random background video if none is specified
    if background_video_path is None:
        background_video_path = random_background_video()
    video_probe = ffmpeg.probe(background_video_path)
    video_stream = get_video_stream(video_probe)
    video_duration = float(video_stream['duration'])
    if start_time is None:
        start_time = random_start_time(video_duration, merged_audio_duration)
    # Cut a clip slightly longer than the combined audio and reset its timestamps
    video = ffmpeg.input(background_video_path, stream_loop=-1)
    video = video.trim(start=start_time, end=start_time + merged_audio_duration + 0.3)
    video = video.setpts('PTS-STARTPTS')
    # Crop to 9:16 only when the source isn't 9:16 already
    new_width = int(video_stream['width'])
    new_height = int(video_stream['height'])
    if new_width * 16 != new_height * 9:
        new_width, new_height = get_new_dimensions(video_stream)
        video = ffmpeg.filter_(video, 'crop', new_width, new_height)
    # Overlay the title card, centered, while the title audio plays
    image_stream = get_image_stream(title_image_path, new_width)
    video = ffmpeg.overlay(
        video, image_stream, x='(W-w)/2', y='(H-h)/2', enable=f'between(t,0,{title_audio_duration})')
    if subtitles_path is not None and os.path.isfile(subtitles_path):
        # Burn in the subtitles with a fixed ASS style
        style = "FontName=Londrina Solid,FontSize=20,PrimaryColour=&H00ffffff,OutlineColour=&H00000000," \
                "BackColour=&H80000000,Bold=1,Italic=0,Alignment=10"
        video = ffmpeg.filter_(
            video, 'subtitles', subtitles_path, force_style=style)
    return video
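
# The assembled filter graph is roughly:
#   input -> trim -> setpts -> [crop] -> overlay(title card) -> [subtitles]
# where the bracketed stages are only added when needed (non-9:16 source,
# existing subtitle file).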

def random_background_video():
    # Check that the folder exists
    if not os.path.isdir(VIDEO_FOLDER_PATH):
        print(f"Error: '{VIDEO_FOLDER_PATH}' is not a valid directory.")
        return None
    # List all files in the folder
    video_files = [f for f in os.listdir(VIDEO_FOLDER_PATH) if os.path.isfile(os.path.join(VIDEO_FOLDER_PATH, f))]
    if not video_files:
        print(f"No video files found in '{VIDEO_FOLDER_PATH}'.")
        return None
    # Return the path of a randomly chosen video file
    return os.path.join(VIDEO_FOLDER_PATH, random.choice(video_files))
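
# Note: every regular file in the folder is treated as a video clip; a
# non-video file placed in VIDEO_FOLDER_PATH would only fail later, inside
# ffmpeg.probe.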

def random_start_time(video_duration, audio_duration):
    # The clip must fit inside the source video, so the latest valid
    # start time is the video duration minus the audio duration
    max_start_time = max(0, video_duration - audio_duration)
    return random.uniform(0, max_start_time)

def get_audio_duration(audio_path):
    # Probe the file and read the duration of its first stream
    probe = ffmpeg.probe(audio_path)
    return float(probe['streams'][0]['duration'])

def merge_video_audio(video, audio, output_video_path):
    output = ffmpeg.output(video, audio, output_video_path,
                           vcodec=VCODEC, threads=THREADS)
    output = ffmpeg.overwrite_output(output)
    ffmpeg.run(output)
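
# overwrite_output adds ffmpeg's -y flag, so an existing output file is
# replaced without an interactive prompt.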

def get_video_stream(video_probe):
    # Return the first video stream from an ffmpeg probe result
    return next((stream for stream in video_probe['streams'] if stream['codec_type'] == 'video'), None)

def get_new_dimensions(video_stream):
    width = int(video_stream['width'])
    height = int(video_stream['height'])
    if width / height > 9 / 16:  # wider than 9:16, crop the sides
        return int(height * (9 / 16)), height
    else:  # narrower than 9:16, crop the top and bottom
        return width, int(width * (16 / 9))
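
# Worked example: a 1920x1080 source is wider than 9:16, so the crop keeps
# the full height and returns (int(1080 * 9 / 16), 1080) = (607, 1080).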

def merge_audio(title_audio_path, description_audio_path):
    title_audio = ffmpeg.input(title_audio_path)
    description_audio = ffmpeg.input(description_audio_path)
    # Concatenate the title and description audio files
    return ffmpeg.concat(title_audio, description_audio, v=0, a=1)
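
# In the concat filter, v=0 and a=1 request zero video streams and one audio
# stream in the output, i.e. a pure audio concatenation.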

def get_image_stream(title_image_path, video_new_width):
    # Build the title card image stream, scaled to roughly 80% of the
    # video width (capped at 864 px); returns None if no image is given
    if title_image_path is None:
        return None
    image_stream = ffmpeg.input(title_image_path)
    video_width = min(1080, video_new_width)
    image_width = min(864, int(video_width * 0.8))
    # -1 keeps the image's aspect ratio when scaling
    return image_stream.filter_('scale', image_width, -1)

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Replace newlines with spaces so words on adjacent lines don't fuse
    return text.replace('\n', ' ')


# Example usage
if __name__ == '__main__':
    generate_video(read_text_file('title-text.txt'), read_text_file('description-text.txt'))