videojs · adrums86 · Oct 29, 2024 · Oct 11, 2024 · Oct 11, 2024 · Oct 15, 2024
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -63,7 +63,7 @@
     "global": "^4.4.0",
     "m3u8-parser": "^7.2.0",
     "mpd-parser": "^1.3.1",
-    "mux.js": "7.0.3",
+    "mux.js": "7.1.0",
     "video.js": "^7 || ^8"
   },
   "peerDependencies": {

diff --git a/src/media-segment-request.js b/src/media-segment-request.js
@@ -18,6 +18,8 @@ export const REQUEST_ERRORS = {
   ABORTED: -102
 };
 
+const WEB_VTT_CODEC = 'wvtt';
+
 /**
  * Abort all requests
  *
@@ -164,6 +166,43 @@ const handleKeyResponse = (segment, objects, finishProcessingFn, triggerSegmentE
   return finishProcessingFn(null, segment);
 };
 
+/**
+ * Processes an mp4 init segment depending on the codec through the transmuxer.
+ *
+ * @param {Object} segment init segment to process
+ * @param {string} codec the codec of the text segments
+ */
+const initMp4Text = (segment, codec) => {
+  if (codec === WEB_VTT_CODEC) {
+    segment.transmuxer.postMessage({
+      action: 'initMp4WebVttParser',
+      data: segment.map.bytes
+    });
+  }
+};
+
+/**
+ * Parses an mp4 text segment with the transmuxer and calls the doneFn from
+ * the segment loader.
+ *
+ * @param {Object} segment the text segment to parse
+ * @param {string} codec the codec of the text segment
+ * @param {Function} doneFn the doneFn passed from the segment loader
+ */
+const parseMp4TextSegment = (segment, codec, doneFn) => {
+  if (codec === WEB_VTT_CODEC) {
+    workerCallback({
+      action: 'getMp4WebVttText',
+      data: segment.bytes,
+      transmuxer: segment.transmuxer,
+      callback: ({data, mp4VttCues}) => {
+        segment.bytes = data;
+        doneFn(null, segment, { mp4VttCues });
+      }
+    });
+  }
+};
+
 const parseInitSegment = (segment, callback) => {
   const type = detectContainerForBytes(segment.map.bytes);
 
@@ -206,6 +245,10 @@ const parseInitSegment = (segment, callback) => {
           segment.map.timescales[track.id] = track.timescale;
         }
 
+        if (track.type === 'text') {
+          initMp4Text(segment, track.codec);
+        }
+
       });
 
       return callback(null);
@@ -468,6 +511,16 @@ const handleSegmentBytes = ({
   if (isLikelyFmp4MediaSegment(bytesAsUint8Array)) {
     segment.isFmp4 = true;
     const {tracks} = segment.map;
+    const isMp4TextSegment = tracks.text && (!tracks.audio || !tracks.video);
+
+    if (isMp4TextSegment) {
+      dataFn(segment, {
+        data: bytesAsUint8Array,
+        type: 'text'
+      });
+      parseMp4TextSegment(segment, tracks.text.codec, doneFn);
+      return;
+    }
 
     const trackInfo = {
       isFmp4: true,

diff --git a/src/transmuxer-worker.js b/src/transmuxer-worker.js
@@ -16,6 +16,7 @@
 
 import {Transmuxer} from 'mux.js/lib/mp4/transmuxer';
 import CaptionParser from 'mux.js/lib/mp4/caption-parser';
+import WebVttParser from 'mux.js/lib/mp4/webvtt-parser';
 import mp4probe from 'mux.js/lib/mp4/probe';
 import tsInspector from 'mux.js/lib/tools/ts-inspector.js';
 import {
@@ -207,6 +208,44 @@
     }, [segment.buffer]);
   }
 
+  /**
+   * Initializes the WebVttParser and passes the init segment.
+   *
+   * @param {Uint8Array} data mp4 boxed WebVTT init segment data
+   */
+  initMp4WebVttParser(data) {
+    if (!this.webVttParser) {
+      this.webVttParser = new WebVttParser();
+    }
+    const segment = new Uint8Array(data.data, data.byteOffset, data.byteLength);
+
+    // Set the timescale for the parser.
+    // This can be called repeatedly in order to set and re-set the timescale.
+    this.webVttParser.init(segment);
+  }
+
+  /**
+   * Parse an mp4 encapsulated WebVTT segment and return an array of cues.
+   *
+   * @param {Uint8Array} data a text/webvtt segment
+   * @return {Object[]} an array of parsed cue objects
+   */
+  getMp4WebVttText(data) {
+    if (!this.webVttParser) {
+      // timescale might not be set yet if the parser is created before an init segment is passed.
+      // default timescale is 90k.
+      this.webVttParser = new WebVttParser();
+    }
+    const segment = new Uint8Array(data.data, data.byteOffset, data.byteLength);
+    const parsed = this.webVttParser.parseSegment(segment);
+
+    this.self.postMessage({
+      action: 'getMp4WebVttText',
+      mp4VttCues: parsed || [],
+      data: segment.buffer
+    }, [segment.buffer]);
+  }
+
   probeMp4StartTime({timescales, data}) {
     const startTime = mp4probe.startTime(timescales, data);
 

diff --git a/src/vtt-segment-loader.js b/src/vtt-segment-loader.js
@@ -46,11 +46,6 @@ export default class VTTSegmentLoader extends SegmentLoader {
     this.shouldSaveSegmentTimingInfo_ = false;
   }
 
-  createTransmuxer_() {
-    // don't need to transmux any subtitles
-    return null;
-  }
-
   /**
    * Indicates which time ranges are buffered
    *
@@ -282,6 +277,11 @@ export default class VTTSegmentLoader extends SegmentLoader {
     }
 
     const segmentInfo = this.pendingSegment_;
+    const isMp4WebVttSegmentWithCues = result.mp4VttCues && result.mp4VttCues.length;
+
+    if (isMp4WebVttSegmentWithCues) {
+      segmentInfo.mp4VttCues = result.mp4VttCues;
+    }
 
     // although the VTT segment loader bandwidth isn't really used, it's good to
     // maintain functionality between segment loaders
@@ -334,11 +334,13 @@ export default class VTTSegmentLoader extends SegmentLoader {
       return;
     }
 
-    this.updateTimeMapping_(
-      segmentInfo,
-      this.syncController_.timelines[segmentInfo.timeline],
-      this.playlist_
-    );
+    if (!isMp4WebVttSegmentWithCues) {
+      this.updateTimeMapping_(
+        segmentInfo,
+        this.syncController_.timelines[segmentInfo.timeline],
+        this.playlist_
+      );
+    }
 
     if (segmentInfo.cues.length) {
       segmentInfo.timingInfo = {
@@ -380,14 +382,49 @@ export default class VTTSegmentLoader extends SegmentLoader {
     this.handleAppendsDone_();
   }
 
-  handleData_() {
-    // noop as we shouldn't be getting video/audio data captions
-    // that we do not support here.
+  handleData_(simpleSegment, result) {
+    const isVttType = simpleSegment && simpleSegment.type === 'vtt';
+    const isTextResult = result && result.type === 'text';
+    const isFmp4VttSegment = isVttType && isTextResult;
+    // handle segment data for fmp4 encapsulated webvtt
+
+    if (isFmp4VttSegment) {
+      super.handleData_(simpleSegment, result);
+    }
   }
+
   updateTimingInfoEnd_() {
     // noop
   }
 
+  /**
+   * Utility function for converting mp4 webvtt cue objects into VTTCues.
+   *
+   * @param {Object} segmentInfo with mp4 webvtt cues for parsing into VTTCue objecs
+   */
+  parseMp4VttCues_(segmentInfo) {
+    const timestampOffset = this.sourceUpdater_.videoTimestampOffset() === null ?
+      this.sourceUpdater_.audioTimestampOffset() :
+      this.sourceUpdater_.videoTimestampOffset();
+
+    segmentInfo.mp4VttCues.forEach((cue) => {
+      const start = cue.start + timestampOffset;
+      const end = cue.end + timestampOffset;
+      const vttCue = new window.VTTCue(start, end, cue.cueText);
+
+      if (cue.settings) {
+        cue.settings.split(' ').forEach((cueSetting) => {
+          const keyValString = cueSetting.split(':');
+          const key = keyValString[0];
+          const value = keyValString[1];
+
+          vttCue[key] = isNaN(value) ? value : Number(value);
+        });
+      }
+      segmentInfo.cues.push(vttCue);
+    });
+  }
+
   /**
    * Uses the WebVTT parser to parse the segment response
    *
@@ -406,6 +443,14 @@ export default class VTTSegmentLoader extends SegmentLoader {
       throw new NoVttJsError();
     }
 
+    segmentInfo.cues = [];
+    segmentInfo.timestampmap = { MPEGTS: 0, LOCAL: 0 };
+
+    if (segmentInfo.mp4VttCues) {
+      this.parseMp4VttCues_(segmentInfo);
+      return;
+    }
+
     if (typeof window.TextDecoder === 'function') {
       decoder = new window.TextDecoder('utf8');
     } else {
@@ -419,9 +464,6 @@ export default class VTTSegmentLoader extends SegmentLoader {
       decoder
     );
 
-    segmentInfo.cues = [];
-    segmentInfo.timestampmap = { MPEGTS: 0, LOCAL: 0 };
-
     parser.oncue = segmentInfo.cues.push.bind(segmentInfo.cues);
     parser.ontimestampmap = (map) => {
       segmentInfo.timestampmap = map;

diff --git a/test/media-segment-request.test.js b/test/media-segment-request.test.js
@@ -21,7 +21,9 @@ import {
   mp4VideoInit,
   muxed as muxedSegment,
   webmVideo,
-  webmVideoInit
+  webmVideoInit,
+  mp4WebVttInit,
+  mp4WebVtt
 } from 'create-test-data!segments';
 // needed for plugin registration
 import '../src/videojs-http-streaming';
@@ -1863,3 +1865,84 @@ QUnit.test('can get emsg ID3 frames from fmp4 audio segment', function(assert) {
   // Simulate receiving the init segment after the media
   this.standardXHRResponse(initReq, mp4AudioInit());
 });
+
+QUnit.test('can get webvtt text from an fmp4 segment', function(assert) {
+  const done = assert.async();
+  // expected frame data
+  const expectedCues = [
+    {
+      cueText: '2024-10-16T05:13:50Z\nen # 864527815',
+      end: 1729055630.9,
+      settings: undefined,
+      start: 1729055630
+    },
+    {
+      cueText: '2024-10-16T05:13:51Z\nen # 864527815',
+      end: 1729055631.9,
+      settings: undefined,
+      start: 1729055631
+    }
+  ];
+  const transmuxer = new videojs.EventTarget();
+
+  transmuxer.postMessage = (event) => {
+    if (event.action === 'getMp4WebVttText') {
+      transmuxer.trigger({
+        type: 'message',
+        data: {
+          action: 'getMp4WebVttText',
+          data: event.data,
+          mp4VttCues: expectedCues
+        }
+      });
+    }
+
+    if (event.action === 'probeMp4Tracks') {
+      transmuxer.trigger({
+        type: 'message',
+        data: {
+          action: 'probeMp4Tracks',
+          data: event.data,
+          tracks: [{type: 'text', codec: 'wvtt'}]
+        }
+      });
+    }
+  };
+
+  mediaSegmentRequest({
+    xhr: this.xhr,
+    xhrOptions: this.xhrOptions,
+    decryptionWorker: this.mockDecrypter,
+    segment: {
+      transmuxer,
+      resolvedUri: 'mp4WebVtt.mp4',
+      map: {
+        resolvedUri: 'mp4WebVttInit.mp4'
+      },
+      isFmp4: true
+    },
+    progressFn: this.noop,
+    trackInfoFn: this.noop,
+    timingInfoFn: this.noop,
+    id3Fn: this.noop,
+    captionsFn: this.noop,
+    dataFn: this.noop,
+    doneFn: (_e, _s, result) => {
+      assert.equal(result.mp4VttCues.length, 2, 'there are 2 mp4VttCues');
+      assert.deepEqual(result.mp4VttCues, expectedCues, 'mp4VttCues are expected values');
+      transmuxer.off();
+      done();
+    },
+    triggerSegmentEventFn: this.noop
+  });
+  assert.equal(this.requests.length, 2, 'there are two requests');
+
+  const initReq = this.requests.shift();
+  const segmentReq = this.requests.shift();
+
+  assert.equal(initReq.uri, 'mp4WebVttInit.mp4', 'the first request is for the init segment');
+  assert.equal(segmentReq.uri, 'mp4WebVtt.mp4', 'the second request is for a segment');
+
+  this.standardXHRResponse(initReq, mp4WebVttInit());
+  this.standardXHRResponse(segmentReq, mp4WebVtt());
+});
diff --git a/test/segments/mp4WebVtt.mp4 b/test/segments/mp4WebVtt.mp4
diff --git a/test/segments/mp4WebVttInit.mp4 b/test/segments/mp4WebVttInit.mp4