Skip to content

Commit

Permalink
Combine each group of 3 consecutive segments in the raw transcript
Browse files Browse the repository at this point in the history
Segments in typical YouTube transcripts are very short. Grouping them can
improve readability, at the cost of making it less precise which part of the
transcript corresponds to the current video position. As an initial iteration
of grouping, just combine every 3 segments together.

Ideally, segments would be tidy linguistic units such as sentences, but
auto-generated YouTube transcripts often do not contain the information
needed to do this.

Part of #955.
  • Loading branch information
robertknight committed Jun 20, 2023
1 parent d183c19 commit a79b4c3
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 2 deletions.
6 changes: 5 additions & 1 deletion via/static/scripts/video_player/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import VideoPlayerApp from './components/VideoPlayerApp';
import { readConfig } from './config';
import { sampleTranscript } from './sample-transcript';
import type { TranscriptData } from './utils/transcript';
import { mergeSegments } from './utils/transcript';

export function init() {
const rootEl = document.querySelector('#app');
Expand All @@ -24,10 +25,13 @@ export function init() {
video_id: videoId,
} = readConfig();

// Pre-fetched transcript for testing. Use the video ID
// Pre-fetched transcript for testing. From the video
// https://www.youtube.com/watch?v=x8TO-nrUtSI.
const transcript: TranscriptData = { segments: sampleTranscript };

// Group segments together for better readability.
transcript.segments = mergeSegments(transcript.segments, 3);

render(
<VideoPlayerApp
videoId={videoId}
Expand Down
40 changes: 39 additions & 1 deletion via/static/scripts/video_player/utils/test/transcript-test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import { sampleTranscript } from '../../sample-transcript';
import { filterTranscript, formatTranscript } from '../transcript';
import {
filterTranscript,
formatTranscript,
mergeSegments,
} from '../transcript';

describe('filterTranscript', () => {
it('returns matching segments and offsets', () => {
Expand Down Expand Up @@ -33,3 +37,37 @@ you saw a playstation 1 game if you were
);
});
});

describe('mergeSegments', () => {
  // Seven one-word segments whose start times are 1..7, so the expected
  // start of each merged group is easy to read off.
  const captions = ['One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven'];
  const segments = captions.map((text, index) => ({ start: index + 1, text }));

  [
    {
      // A group size of 1 leaves every segment as-is.
      groupSize: 1,
      expected: segments,
    },
    {
      groupSize: 2,
      expected: [
        { start: 1, text: 'One Two' },
        { start: 3, text: 'Three Four' },
        { start: 5, text: 'Five Six' },
        { start: 7, text: 'Seven' },
      ],
    },
    {
      groupSize: 3,
      expected: [
        { start: 1, text: 'One Two Three' },
        { start: 4, text: 'Four Five Six' },
        { start: 7, text: 'Seven' },
      ],
    },
    {
      // Group size larger than the input: everything ends up in one segment.
      groupSize: 10,
      expected: [{ start: 1, text: 'One Two Three Four Five Six Seven' }],
    },
  ].forEach(({ groupSize, expected }) => {
    // Include the group size in the title so a failing case can be identified
    // from the test report.
    it(`merges adjacent segments together (group size ${groupSize})`, () => {
      const merged = mergeSegments(segments, groupSize);
      assert.deepEqual(merged, expected);
    });
  });

  it('returns an empty list if there are no segments', () => {
    assert.deepEqual(mergeSegments([], 3), []);
  });

  it('does not modify the input segments', () => {
    const snapshot = segments.map(segment => ({ ...segment }));
    mergeSegments(segments, 3);
    assert.deepEqual(segments, snapshot);
  });
});
20 changes: 20 additions & 0 deletions via/static/scripts/video_player/utils/transcript.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,23 @@ export function filterTranscript(
export function formatTranscript(transcript: Segment[]): string {
  // Collect the text of each segment, in order.
  const lines: string[] = [];
  for (const segment of transcript) {
    lines.push(segment.text);
  }
  // One segment per line, with no trailing newline.
  return lines.join('\n');
}

/**
 * Merge every group of `n` consecutive segments into a single transcript
 * segment.
 *
 * This is useful for transcript sources like YouTube where each entry is short,
 * typically just a few words, and so the transcript can be more readable if
 * segments are grouped.
 *
 * Each merged segment is a copy of the first segment in its group, with the
 * text of the remaining segments in the group appended, separated by single
 * spaces. The final group may contain fewer than `n` segments. The input
 * array and its segments are not modified.
 *
 * @param segments - Segments to merge, in transcript order
 * @param n - Number of consecutive segments per group. Must be a positive
 *   integer.
 * @returns A new array of merged segments
 * @throws {RangeError} If `n` is not a positive integer
 */
export function mergeSegments(segments: Segment[], n: number): Segment[] {
  // A zero or NaN group size makes `idx % n` evaluate to NaN, so no segment
  // would ever start a group. Fractional sizes produce irregular groups.
  // Reject these up-front with a clear error.
  if (!Number.isInteger(n) || n < 1) {
    throw new RangeError(`Group size must be a positive integer, got ${n}`);
  }

  const merged: Segment[] = [];
  let current: Segment | undefined;

  segments.forEach((segment, idx) => {
    if (current === undefined || idx % n === 0) {
      // Start a new group. Copy the segment so that appending text below
      // does not modify the caller's data.
      current = { ...segment };
      merged.push(current);
    } else {
      current.text += ' ' + segment.text;
    }
  });

  return merged;
}

0 comments on commit a79b4c3

Please sign in to comment.