Skip to content

Commit

Permalink
Rewrite transcript logic to be more generic
Browse files Browse the repository at this point in the history
The transcript logic in Invidious was written specifically
as a workaround for captions, and not transcripts as a feature.

This commit genericises the logic a bit as so it can be used for
implementing transcripts within Invidious' API and UI as well.

The most notable change is the added parsing of section headings
when it was previously skipped over in favor of regular lines.
  • Loading branch information
syeopite committed Jun 12, 2024
1 parent eda7444 commit 0224162
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 34 deletions.
9 changes: 7 additions & 2 deletions src/invidious/routes/api/v1/videos.cr
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos

if CONFIG.use_innertube_for_captions
params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
initial_data = YoutubeAPI.get_transcript(params)

webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
transcript = Invidious::Videos::Transcript.from_raw(
YoutubeAPI.get_transcript(params),
caption.language_code,
caption.auto_generated
)

webvtt = transcript.to_vtt
else
# Timedtext API handling
url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
Expand Down
86 changes: 54 additions & 32 deletions src/invidious/videos/transcript.cr
Original file line number Diff line number Diff line change
@@ -1,8 +1,21 @@
module Invidious::Videos
# Namespace for methods primarily relating to Transcripts
module Transcript
record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
# A `Transcripts` struct encapsulates a sequence of lines that together forms the whole transcript for a given YouTube video.
# These lines can be categorized into two types: section headings and regular lines representing content from the video.
struct Transcript
# Types
record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
alias TranscriptLine = HeadingLine | RegularLine

property lines : Array(TranscriptLine)
property language_code : String
property auto_generated : Bool

# Initializes a new Transcript struct with the contents and associated metadata describing it
def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool)
end

# Generates a protobuf string to fetch the requested transcript from YouTube
def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
kind = auto_generated ? "asr" : ""

Expand Down Expand Up @@ -30,48 +43,57 @@ module Invidious::Videos
return params
end

def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
# Convert into array of TranscriptLine
lines = self.parse(initial_data)

settings_field = {
"Kind" => "captions",
"Language" => target_language,
}

# Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
vtt = WebVTT.build(settings_field) do |vtt|
lines.each do |line|
vtt.cue(line.start_ms, line.end_ms, line.line)
end
end

return vtt
end

private def self.parse(initial_data : Hash(String, JSON::Any))
# Constructs a Transcripts struct from the initial YouTube response
def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
"content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
"initialSegments").as_a

lines = [] of TranscriptLine

body.each do |line|
# Transcript section headers. They are not apart of the captions and as such we can safely skip them.
if line.as_h.has_key?("transcriptSectionHeaderRenderer")
next
if unpacked_line = line["transcriptSectionHeaderRenderer"]?
line_type = HeadingLine
else
unpacked_line = line["transcriptSegmentRenderer"]
line_type = RegularLine
end

line = line["transcriptSegmentRenderer"]
start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
text = extract_text(unpacked_line["snippet"]) || ""

start_ms = line["startMs"].as_s.to_i.millisecond
end_ms = line["endMs"].as_s.to_i.millisecond
lines << line_type.new(start_ms, end_ms, text)
end

text = extract_text(line["snippet"]) || ""
return Transcript.new(
lines: lines,
language_code: language_code,
auto_generated: auto_generated,
)
end

lines << TranscriptLine.new(start_ms, end_ms, text)
# Converts transcript lines to a WebVTT file
#
# This is used within Invidious to replace subtitles
# as to workaround YouTube's rate-limited timedtext endpoint.
def to_vtt
settings_field = {
"Kind" => "captions",
"Language" => @language_code,
}

vtt = WebVTT.build(settings_field) do |vtt|
@lines.each do |line|
# Section headers are excluded from the VTT conversion as to
# match the regular captions returned from YouTube as much as possible
next if line.is_a? HeadingLine

vtt.cue(line.start_ms, line.end_ms, line.line)
end
end

return lines
return vtt
end
end
end

0 comments on commit 0224162

Please sign in to comment.