[torchcodec] Return subsequent frame if FFMPEG seeks past the requested timestamp (#178)

ahmadsharif1 · facebook-github-bot · commit 9cf11fc6b76a · 2024-08-12T14:11:11.000-07:00
Summary: Pull Request resolved: #178. `get_frame_displayed_at` works as follows: we call `avformat_seek_file` with `min_ts=-inf ts=timestamp max_ts=timestamp` (see https://ffmpeg.org/doxygen/7.0/group__lavf__decoding.html#ga3b40fc8d2fda6992ae6ea2567d71ba30). In theory this should never seek past our timestamp. However, for some videos ffmpeg does seek past our timestamp, and in that case we never return a valid frame. The fix is to warn the user and return the first frame after the timestamp if this ever happens. Differential Revision: D61139386
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -934,14 +934,27 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux(
     }
   }
   setCursorPtsInSeconds(seconds);
-  return getDecodedOutputWithFilter(
-      [seconds, this](int frameStreamIndex, AVFrame* frame) {
-        StreamInfo& stream = streams_[frameStreamIndex];
-        double frameStartTime = ptsToSeconds(frame->pts, stream.timeBase);
-        double frameEndTime =
-            ptsToSeconds(frame->pts + getDuration(frame), stream.timeBase);
-        return seconds >= frameStartTime && seconds < frameEndTime;
-      });
+  return getDecodedOutputWithFilter([seconds, this](
+                                        int frameStreamIndex, AVFrame* frame) {
+    StreamInfo& stream = streams_[frameStreamIndex];
+    double frameStartTime = ptsToSeconds(frame->pts, stream.timeBase);
+    double frameEndTime =
+        ptsToSeconds(frame->pts + getDuration(frame), stream.timeBase);
+    if (frameStartTime > seconds) {
+      // FFMPEG seeked past the frame we are looking for even though we
+      // set max_ts to be our needed timestamp in avformat_seek_file()
+      // in maybeSeekToBeforeDesiredPts().
+      // This could be a bug in FFMPEG: https://trac.ffmpeg.org/ticket/11137
+      // In this case we return the very next frame instead of throwing an
+      // exception.
+      std::cerr
+          << "TorchCodec: WARNING: ffmpeg seeked past requested timestamp="
+          << seconds << ". Returning frame with startTime=" << frameStartTime
+          << " endTime=" << frameEndTime << std::endl;
+      return true;
+    }
+    return seconds >= frameStartTime && seconds < frameEndTime;
+  });
 }
 
 void VideoDecoder::validateUserProvidedStreamIndex(uint64_t streamIndex) {
diff --git a/test/decoders/test_simple_video_decoder.py b/test/decoders/test_simple_video_decoder.py
@@ -9,7 +9,8 @@
 
 from torchcodec.decoders import _core, SimpleVideoDecoder
 
-from ..utils import assert_tensor_close, assert_tensor_equal, NASA_VIDEO
+from ..utils import assert_tensor_close, assert_tensor_equal, H265_VIDEO, NASA_VIDEO
+
 
 
 class TestSimpleDecoder:
@@ -320,6 +321,15 @@ def test_get_frame_displayed_at(self):
         assert isinstance(decoder.get_frame_displayed_at(6.02).pts_seconds, float)
         assert isinstance(decoder.get_frame_displayed_at(6.02).duration_seconds, float)
 
+    def test_get_frame_displayed_at_h265(self):
+        decoder = SimpleVideoDecoder(H265_VIDEO.path)
+        # For H265, FFMPEG's seeking is not precise: even though we ask to seek with
+        # max_ts=0.5, it lands past that point, so we expect frame 6 (pts=0.6s) back.
+        # TODO: Check with the ffmpeg-devel mailing list and see if this can be fixed
+        # upstream in ffmpeg.
+        ref_frame6 = H265_VIDEO.get_frame_by_name("frame000006")
+        assert_tensor_equal(ref_frame6, decoder.get_frame_displayed_at(0.5).data)
+
     def test_get_frame_displayed_at_fails(self):
         decoder = SimpleVideoDecoder(NASA_VIDEO.path)
 
diff --git a/test/generate_reference_resources.sh b/test/generate_reference_resources.sh
@@ -42,8 +42,19 @@ ffmpeg -y -i "$VIDEO_PATH" -b:a 192K -vn "$VIDEO_PATH.audio.mp3"
 
 # TODO: Add frames decoded by Nvidia's NVDEC.
 
+# This video was generated by running the following:
+# conda install -c conda-forge x265
+# ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265  --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
+# ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y
+VIDEO_PATH=$RESOURCES_DIR/h265_video.mp4
+FRAMES=(6)
+for frame in "${FRAMES[@]}"; do
+  frame_name=$(printf "%06d" "$frame")
+  ffmpeg -y -i "$VIDEO_PATH" -vf select="eq(n\,$frame)" -vsync vfr -q:v 2 "$VIDEO_PATH.frame$frame_name.bmp"
+done
+
 for bmp in "$RESOURCES_DIR"/*.bmp
 do
-  python3 convert_image_to_tensor.py "$bmp"
+  python3 "$TORCHCODEC_PATH/test/convert_image_to_tensor.py" "$bmp"
   rm -f "$bmp"
 done
diff --git a/test/resources/h265_video.mp4 b/test/resources/h265_video.mp4
diff --git a/test/resources/h265_video.mp4.frame000006.pt b/test/resources/h265_video.mp4.frame000006.pt
diff --git a/test/utils.py b/test/utils.py
@@ -152,3 +152,17 @@ def empty_chw_tensor(self) -> torch.Tensor:
 # When we start actually decoding audio-only files, we'll probably need to define
 # a TestAudio class with audio specific values. Until then, we only need a filename.
 NASA_AUDIO = TestContainerFile(filename="nasa_13013.mp4.audio.mp3", frames={})
+
+H265_VIDEO = TestVideo(
+    filename="h265_video.mp4",
+    height=128,
+    width=128,
+    num_color_channels=3,
+    # TODO_OPEN_ISSUE Scott: improve the testing framework so that these values are loaded from a JSON
+    # file and not hardcoded. These values were copied over by hand from the JSON
+    # output from the following command:
+    #  $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/h265_video.mp4 > out.json
+    frames={
+        6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1),
+    },
+)