Combine each group of 3 consecutive segments in the raw transcript

Segments in typical YouTube transcripts are very short. Grouping them can improve readability, at the cost of making it less precise which part of the transcript corresponds to the current video position. As a first iteration, simply combine every 3 consecutive segments. Ideally, segments would be tidy linguistic units such as sentences, but auto-generated YouTube transcripts often do not contain the information needed to do this. Part of #955.
hypothesis · Jun 20, 2023 · a79b4c3 · a79b4c3
1 parent d183c19
commit a79b4c3
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 2 deletions.
diff --git a/via/static/scripts/video_player/index.tsx b/via/static/scripts/video_player/index.tsx
@@ -7,6 +7,7 @@ import VideoPlayerApp from './components/VideoPlayerApp';
 import { readConfig } from './config';
 import { sampleTranscript } from './sample-transcript';
 import type { TranscriptData } from './utils/transcript';
+import { mergeSegments } from './utils/transcript';
 
 export function init() {
   const rootEl = document.querySelector('#app');
@@ -24,10 +25,13 @@ export function init() {
     video_id: videoId,
   } = readConfig();
 
-  // Pre-fetched transcript for testing. Use the video ID
+  // Pre-fetched transcript for testing. From the video
   // https://www.youtube.com/watch?v=x8TO-nrUtSI.
   const transcript: TranscriptData = { segments: sampleTranscript };
 
+  // Group segments together for better readability.
+  transcript.segments = mergeSegments(transcript.segments, 3);
+
   render(
     <VideoPlayerApp
       videoId={videoId}

diff --git a/via/static/scripts/video_player/utils/test/transcript-test.js b/via/static/scripts/video_player/utils/test/transcript-test.js
@@ -1,5 +1,9 @@
 import { sampleTranscript } from '../../sample-transcript';
-import { filterTranscript, formatTranscript } from '../transcript';
+import {
+  filterTranscript,
+  formatTranscript,
+  mergeSegments,
+} from '../transcript';
 
 describe('filterTranscript', () => {
   it('returns matching segments and offsets', () => {
@@ -33,3 +37,37 @@ you saw a playstation 1 game if you were
     );
   });
 });
+
describe('mergeSegments', () => {
  // Seven single-word segments with start times 1..7. Seven is deliberately
  // not a multiple of 2 or 3, so the trailing partial group is exercised.
  const captions = ['One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven'];
  const segments = captions.map((text, index) => ({ start: index + 1, text }));

  [
    {
      groupSize: 1,
      expected: segments,
    },
    {
      groupSize: 2,
      expected: [
        { start: 1, text: 'One Two' },
        { start: 3, text: 'Three Four' },
        { start: 5, text: 'Five Six' },
        { start: 7, text: 'Seven' },
      ],
    },
    {
      groupSize: 3,
      expected: [
        { start: 1, text: 'One Two Three' },
        { start: 4, text: 'Four Five Six' },
        { start: 7, text: 'Seven' },
      ],
    },
  ].forEach(({ groupSize, expected }) => {
    // Include the group size in the title so each case is uniquely
    // identifiable in the test report.
    it(`merges adjacent segments together (group size ${groupSize})`, () => {
      const merged = mergeSegments(segments, groupSize);
      assert.deepEqual(merged, expected);
    });
  });

  it('returns an empty list if there are no segments', () => {
    assert.deepEqual(mergeSegments([], 3), []);
  });

  it('does not modify the input segments', () => {
    const input = segments.map(segment => ({ ...segment }));
    mergeSegments(input, 3);
    assert.deepEqual(input, segments);
  });
});
diff --git a/via/static/scripts/video_player/utils/transcript.ts b/via/static/scripts/video_player/utils/transcript.ts
@@ -72,3 +72,23 @@ export function filterTranscript(
 export function formatTranscript(transcript: Segment[]): string {
   return transcript.map(seg => seg.text).join('\n');
 }
+
/**
 * Merge every group of `n` consecutive segments into a single transcript
 * segment.
 *
 * This is useful for transcript sources like YouTube where each entry is short,
 * typically just a few words, and so the transcript can be more readable if
 * segments are grouped.
 *
 * Each merged segment is a copy of the first segment in its group, with the
 * text of the remaining segments in the group appended, separated by single
 * spaces. If the number of segments is not a multiple of `n`, the final merged
 * segment contains the leftover segments. The input array and its segments are
 * not modified.
 *
 * @param segments - Segments to merge, in transcript order
 * @param n - Number of consecutive segments per group. Must be a positive
 *   integer.
 * @returns A new array of merged segments
 * @throws {RangeError} If `n` is not a positive integer
 */
export function mergeSegments(segments: Segment[], n: number): Segment[] {
  // Reject invalid group sizes up-front. A size of 0 or NaN would otherwise
  // make `idx % n` evaluate to NaN, so the very first segment would be
  // appended to a group that does not exist yet.
  if (!Number.isInteger(n) || n < 1) {
    throw new RangeError(`Group size must be a positive integer, got ${n}`);
  }

  const merged: Segment[] = [];
  let current: Segment | undefined;

  segments.forEach((segment, idx) => {
    if (current === undefined || idx % n === 0) {
      // Start a new group. Copy the segment so that appending text below does
      // not modify the caller's data.
      current = { ...segment };
      merged.push(current);
    } else {
      current.text += ' ' + segment.text;
    }
  });

  return merged;
}