Skip to content

Commit 9cf11fc

Browse files
ahmadsharif1 authored and
facebook-github-bot committed
[torchcodec] Return subsequent frame if FFMPEG seeks past end of the frame (#178)
Summary: Pull Request resolved: #178. `get_frame_displayed_at` works as follows: we call `avformat_seek_file` with `min_ts=-inf, ts=timestamp, max_ts=timestamp` (see https://ffmpeg.org/doxygen/7.0/group__lavf__decoding.html#ga3b40fc8d2fda6992ae6ea2567d71ba30). In theory this should never seek past our timestamp. However, for some videos FFmpeg does seek past our timestamp, and in that case we never return a valid frame. The fix is to warn the user and return the first frame after the timestamp whenever this happens. Differential Revision: D61139386
1 parent dbfef12 commit 9cf11fc

File tree

6 files changed

+58
-10
lines changed

6 files changed

+58
-10
lines changed

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -934,14 +934,27 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux(
934934
}
935935
}
936936
setCursorPtsInSeconds(seconds);
937-
return getDecodedOutputWithFilter(
938-
[seconds, this](int frameStreamIndex, AVFrame* frame) {
939-
StreamInfo& stream = streams_[frameStreamIndex];
940-
double frameStartTime = ptsToSeconds(frame->pts, stream.timeBase);
941-
double frameEndTime =
942-
ptsToSeconds(frame->pts + getDuration(frame), stream.timeBase);
943-
return seconds >= frameStartTime && seconds < frameEndTime;
944-
});
937+
return getDecodedOutputWithFilter([seconds, this](
938+
int frameStreamIndex, AVFrame* frame) {
939+
StreamInfo& stream = streams_[frameStreamIndex];
940+
double frameStartTime = ptsToSeconds(frame->pts, stream.timeBase);
941+
double frameEndTime =
942+
ptsToSeconds(frame->pts + getDuration(frame), stream.timeBase);
943+
if (frameStartTime > seconds) {
944+
// FFMPEG seeked past the frame we are looking for even though we
945+
// set max_ts to be our needed timestamp in avformat_seek_file()
946+
// in maybeSeekToBeforeDesiredPts().
947+
// This could be a bug in FFMPEG: https://trac.ffmpeg.org/ticket/11137
948+
// In this case we return the very next frame instead of throwing an
949+
// exception.
950+
std::cerr
951+
<< "TorchCodec: WARNING: ffmpeg seeked past requested timestamp="
952+
<< seconds << ". Returning frame with startTime=" << frameStartTime
953+
<< " endTime=" << frameEndTime << std::endl;
954+
return true;
955+
}
956+
return seconds >= frameStartTime && seconds < frameEndTime;
957+
});
945958
}
946959

947960
void VideoDecoder::validateUserProvidedStreamIndex(uint64_t streamIndex) {

test/decoders/test_simple_video_decoder.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99

1010
from torchcodec.decoders import _core, SimpleVideoDecoder
1111

12-
from ..utils import assert_tensor_close, assert_tensor_equal, NASA_VIDEO
12+
from ..utils import assert_tensor_close, assert_tensor_equal, H265_VIDEO, NASA_VIDEO
13+
1314

1415

1516
class TestSimpleDecoder:
@@ -320,6 +321,15 @@ def test_get_frame_displayed_at(self):
320321
assert isinstance(decoder.get_frame_displayed_at(6.02).pts_seconds, float)
321322
assert isinstance(decoder.get_frame_displayed_at(6.02).duration_seconds, float)
322323

324+
def test_get_frame_displayed_at_h265(self):
325+
decoder = SimpleVideoDecoder(H265_VIDEO.path)
326+
# Note that for H265, FFMPEG's seeking is not precise. Even though we ask to
327+
# seek with a max_ts=0.5, FFMPEG will seek beyond that point.
328+
# TODO: Check with the ffmpeg-devel mailing list and see if this can be fixed
329+
# upstream in ffmpeg.
330+
ref_frame6 = H265_VIDEO.get_frame_by_name("frame000006")
331+
assert_tensor_equal(ref_frame6, decoder.get_frame_displayed_at(0.5).data)
332+
323333
def test_get_frame_displayed_at_fails(self):
324334
decoder = SimpleVideoDecoder(NASA_VIDEO.path)
325335

test/generate_reference_resources.sh

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,19 @@ ffmpeg -y -i "$VIDEO_PATH" -b:a 192K -vn "$VIDEO_PATH.audio.mp3"
4242

4343
# TODO: Add frames decoded by Nvidia's NVDEC.
4444

45+
# This video was generated by running the following:
46+
# conda install -c conda-forge x265
47+
# ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
48+
# ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y
49+
VIDEO_PATH=$RESOURCES_DIR/h265_video.mp4
50+
FRAMES=(6)
51+
for frame in "${FRAMES[@]}"; do
52+
frame_name=$(printf "%06d" "$frame")
53+
ffmpeg -y -i "$VIDEO_PATH" -vf select="eq(n\,$frame)" -vsync vfr -q:v 2 "$VIDEO_PATH.frame$frame_name.bmp"
54+
done
55+
4556
for bmp in "$RESOURCES_DIR"/*.bmp
4657
do
47-
python3 convert_image_to_tensor.py "$bmp"
58+
python3 $TORCHCODEC_PATH/test/convert_image_to_tensor.py "$bmp"
4859
rm -f "$bmp"
4960
done

test/resources/h265_video.mp4

17.3 KB
Binary file not shown.
49.3 KB
Binary file not shown.

test/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,17 @@ def empty_chw_tensor(self) -> torch.Tensor:
152152
# When we start actually decoding audio-only files, we'll probably need to define
153153
# a TestAudio class with audio specific values. Until then, we only need a filename.
154154
NASA_AUDIO = TestContainerFile(filename="nasa_13013.mp4.audio.mp3", frames={})
155+
156+
H265_VIDEO = TestVideo(
157+
filename="h265_video.mp4",
158+
height=128,
159+
width=128,
160+
num_color_channels=3,
161+
# TODO_OPEN_ISSUE Scott: improve the testing framework so that these values are loaded from a JSON
162+
# file and not hardcoded. These values were copied over by hand from the JSON
163+
# output from the following command:
164+
# $ ffprobe -v error -hide_banner -select_streams v:1 -show_frames -of json test/resources/h265_video.mp4 > out.json
165+
frames={
166+
6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1),
167+
},
168+
)

0 commit comments

Comments
 (0)