diff --git a/DALI_EXTRA_VERSION b/DALI_EXTRA_VERSION index df2ce3c6109..1c7b54f071c 100644 --- a/DALI_EXTRA_VERSION +++ b/DALI_EXTRA_VERSION @@ -1 +1 @@ -69ffed23b15233b583cb5a398e860a63863b2c99 +3a240e3b37b599975886ee65a87a2727ca98a107 diff --git a/dali/operators/reader/loader/video/frames_decoder.cc b/dali/operators/reader/loader/video/frames_decoder.cc index 6beb2966f3a..4cfa06972e8 100644 --- a/dali/operators/reader/loader/video/frames_decoder.cc +++ b/dali/operators/reader/loader/video/frames_decoder.cc @@ -129,7 +129,9 @@ bool FramesDecoder::CheckCodecSupport() { void FramesDecoder::FindVideoStream(bool init_codecs) { if (init_codecs) { - for (size_t i = 0; i < av_state_->ctx_->nb_streams; ++i) { + size_t i = 0; + + for (i = 0; i < av_state_->ctx_->nb_streams; ++i) { av_state_->codec_params_ = av_state_->ctx_->streams[i]->codecpar; av_state_->codec_ = avcodec_find_decoder(av_state_->codec_params_->codec_id); @@ -139,11 +141,12 @@ void FramesDecoder::FindVideoStream(bool init_codecs) { if (av_state_->codec_->type == AVMEDIA_TYPE_VIDEO) { av_state_->stream_id_ = i; - return; + break; } } - DALI_FAIL(make_string("Could not find a valid video stream in a file ", Filename())); + DALI_ENFORCE(i < av_state_->ctx_->nb_streams, + make_string("Could not find a valid video stream in a file ", Filename())); } else { av_state_->stream_id_ = av_find_best_stream(av_state_->ctx_, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); @@ -154,6 +157,10 @@ void FramesDecoder::FindVideoStream(bool init_codecs) { av_state_->codec_params_ = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar; } + if (Height() == 0 || Width() == 0) { + DALI_ENFORCE(avformat_find_stream_info(av_state_->ctx_, nullptr) >= 0); + DALI_ENFORCE(Height() != 0 && Width() != 0, "Couldn't load video size info."); + } } FramesDecoder::FramesDecoder(const std::string &filename) @@ -236,20 +243,83 @@ FramesDecoder::FramesDecoder(const char *memory_file, int memory_file_size, bool DetectVfr(); } +void FramesDecoder::CreateAvState(std::unique_ptr &av_state, bool init_codecs) { + av_state->ctx_ = avformat_alloc_context(); + DALI_ENFORCE(av_state_->ctx_, "Could not alloc avformat context"); + + uint8_t *av_io_buffer = static_cast(av_malloc(default_av_buffer_size)); + + AVIOContext *av_io_context = avio_alloc_context( + av_io_buffer, + default_av_buffer_size, + 0, + &memory_video_file_.value(), + detail::read_memory_video_file, + nullptr, + detail::seek_memory_video_file); + + av_state->ctx_->pb = av_io_context; + + int ret = avformat_open_input(&av_state->ctx_, "", nullptr, nullptr); + DALI_ENFORCE( + ret == 0, + make_string( + "Failed to open video file ", + Filename(), + "due to ", + detail::av_error_string(ret))); + av_state->stream_id_ = av_find_best_stream( + av_state->ctx_, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); + av_state->codec_params_ = av_state->ctx_->streams[av_state->stream_id_]->codecpar; + + av_state->codec_ctx_ = avcodec_alloc_context3(av_state->codec_); + DALI_ENFORCE(av_state->codec_ctx_, "Could not alloc av codec context"); + + ret = avcodec_parameters_to_context(av_state->codec_ctx_, av_state->codec_params_); + DALI_ENFORCE( + ret >= 0, + make_string("Could not fill the codec based on parameters: ", detail::av_error_string(ret))); + + av_state->packet_ = av_packet_alloc(); + DALI_ENFORCE(av_state->packet_, "Could not allocate av packet"); +} + void FramesDecoder::ParseNumFrames() { - int curr_num_frames = 0; - while (av_read_frame(av_state_->ctx_, av_state_->packet_) >= 0) { + if (IsFormatSeekable()) { + CountFrames(av_state_.get()); + Reset(); + } else { + // Failover for unseekable video + auto current_position = memory_video_file_->position_; + memory_video_file_->Seek(0, SEEK_SET); + std::unique_ptr tmp_av_state = std::make_unique(); + CreateAvState(tmp_av_state, false); + CountFrames(tmp_av_state.get()); + memory_video_file_->Seek(current_position, SEEK_SET); + } +} + +void FramesDecoder::CountFrames(AvState *av_state) { + num_frames_ = 0; + while (av_read_frame(av_state->ctx_, av_state->packet_) >= 0) { // We want to make sure that we call av_packet_unref in every iteration - auto packet = AVPacketScope(av_state_->packet_, av_packet_unref); + auto packet = AVPacketScope(av_state->packet_, av_packet_unref); - if (packet->stream_index != av_state_->stream_id_) { + if (packet->stream_index != av_state->stream_id_) { continue; } - curr_num_frames++; + ++num_frames_.value(); } +} - num_frames_ = curr_num_frames; - Reset(); +bool FramesDecoder::IsFormatSeekable() { + if ( + av_state_->ctx_->iformat->read_seek == nullptr && + av_state_->ctx_->iformat->read_seek2 == nullptr) { + return false; + } + + return av_state_->ctx_->pb->read_seek != nullptr; } void FramesDecoder::BuildIndex() { diff --git a/dali/operators/reader/loader/video/frames_decoder.h b/dali/operators/reader/loader/video/frames_decoder.h index de0514b7e5b..df2a7f1cb31 100644 --- a/dali/operators/reader/loader/video/frames_decoder.h +++ b/dali/operators/reader/loader/video/frames_decoder.h @@ -58,8 +58,10 @@ struct AvState { av_frame_free(&frame_); } avcodec_free_context(&codec_ctx_); - avformat_close_input(&ctx_); - avformat_free_context(ctx_); + if (ctx_ != nullptr) { + avformat_close_input(&ctx_); + avformat_free_context(ctx_); + } ctx_ = nullptr; codec_ = nullptr; @@ -207,6 +209,8 @@ class DLL_PUBLIC FramesDecoder { bool is_full_range_ = false; + std::optional zero_latency_ = {}; + private: /** * @brief Gets the packet from the decoder and reads a frame from it to provided buffer. Returns @@ -217,7 +221,7 @@ class DLL_PUBLIC FramesDecoder { * @param data Output buffer to copy data to. If `copy_to_output` is false, this value is ignored. * @param copy_to_output Whether copy the frame to provided output. * - * @returns True, if the read was succesful, or false, when all regular farmes were consumed. + * @returns True, if the read was succesful, or false, when all regular frames were consumed. * */ bool ReadRegularFrame(uint8_t *data, bool copy_to_output = true); @@ -249,6 +253,12 @@ class DLL_PUBLIC FramesDecoder { void ParseNumFrames(); + void CreateAvState(std::unique_ptr &av_state, bool init_codecs); + + bool IsFormatSeekable(); + + void CountFrames(AvState *av_state); + std::string Filename() { return filename_.has_value() ? filename_.value() : "memory file"; } diff --git a/dali/operators/reader/loader/video/frames_decoder_gpu.cc b/dali/operators/reader/loader/video/frames_decoder_gpu.cc index 1ca06b52ae7..1a1fcee0fc9 100644 --- a/dali/operators/reader/loader/video/frames_decoder_gpu.cc +++ b/dali/operators/reader/loader/video/frames_decoder_gpu.cc @@ -11,24 +11,19 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "dali/operators/reader/loader/video/frames_decoder_gpu.h" - #include #include - #include #include #include #include #include - #include "dali/core/error_handling.h" #include "dali/core/cuda_error.h" #include "dali/pipeline/data/backend.h" #include "dali/pipeline/data/tensor.h" #include "dali/operators/reader/loader/video/nvdecode/color_space.h" - namespace dali { namespace frame_dec_gpu_impl { @@ -85,22 +80,19 @@ class NVDECCache { static NVDECCache cache_inst; return cache_inst; } - NVDECLease GetDecoder(CUVIDEOFORMAT *video_format) { std::unique_lock lock(access_lock); auto codec_type = video_format->codec; - unsigned height = video_format->display_area.bottom - video_format->display_area.top; - unsigned width = video_format->display_area.right - video_format->display_area.left; + unsigned height = video_format->coded_height; + unsigned width = video_format->coded_width; auto num_decode_surfaces = video_format->min_num_decode_surfaces; auto chroma_format = video_format->chroma_format; auto bit_depth_luma_minus8 = video_format->bit_depth_luma_minus8; if (num_decode_surfaces == 0) num_decode_surfaces = 20; - auto range = dec_cache.equal_range(codec_type); - std::unordered_map::iterator best_match = range.second; for (auto it = range.first; it != range.second; ++it) { if (best_match == range.second && it->second.used == false) { @@ -124,20 +116,17 @@ class NVDECCache { best_match->second.used = true; lock.unlock(); CUVIDRECONFIGUREDECODERINFO reconfigParams = { 0 }; - reconfigParams.ulTargetWidth = reconfigParams.ulWidth = width; reconfigParams.ulTargetHeight = reconfigParams.ulHeight = height; reconfigParams.ulNumDecodeSurfaces = num_decode_surfaces; best_match->second.height = height; best_match->second.width = width; best_match->second.num_decode_surfaces = num_decode_surfaces; - CUDA_CALL(cuvidReconfigureDecoder(best_match->second.decoder, &reconfigParams)); return NVDECLease(best_match->second); } #endif lock.unlock(); - auto caps = CUVIDDECODECAPS{}; caps.eCodecType = codec_type; caps.eChromaFormat = chroma_format; @@ -187,8 +176,10 @@ class NVDECCache { #endif decoder_info.ulMaxHeight = max_height; decoder_info.ulMaxWidth = max_width; - decoder_info.ulTargetHeight = height; - decoder_info.ulTargetWidth = width; + decoder_info.ulTargetHeight = video_format->display_area.bottom - + video_format->display_area.top; + decoder_info.ulTargetWidth = video_format->display_area.right - + video_format->display_area.left; decoder_info.ulNumDecodeSurfaces = num_decode_surfaces; decoder_info.ulNumOutputSurfaces = 2; @@ -271,7 +262,13 @@ int process_video_sequence(void *user_data, CUVIDEOFORMAT *video_format) { int process_picture_decode(void *user_data, CUVIDPICPARAMS *picture_params) { FramesDecoderGpu *frames_decoder = static_cast(user_data); - return frames_decoder->ProcessPictureDecode(user_data, picture_params); + return frames_decoder->ProcessPictureDecode(picture_params); +} + +int handle_picture_display(void *user_data, CUVIDPARSERDISPINFO *picture_display_info) { + FramesDecoderGpu *frames_decoder = static_cast(user_data); + + return frames_decoder->HandlePictureDisplay(picture_display_info); } } // namespace frame_dec_gpu_impl @@ -282,10 +279,20 @@ void FramesDecoderGpu::InitBitStreamFilter() { const char* filtername = nullptr; switch (av_state_->codec_params_->codec_id) { case AVCodecID::AV_CODEC_ID_H264: - filtername = "h264_mp4toannexb"; + if (!strcmp(av_state_->ctx_->iformat->long_name, "QuickTime / MOV") || + !strcmp(av_state_->ctx_->iformat->long_name, "FLV (Flash Video)") || + !strcmp(av_state_->ctx_->iformat->long_name, "Matroska / WebM") || + !strcmp(av_state_->ctx_->iformat->long_name, "raw H.264 video")) { + filtername = "h264_mp4toannexb"; + } break; case AVCodecID::AV_CODEC_ID_HEVC: - filtername = "hevc_mp4toannexb"; + if (!strcmp(av_state_->ctx_->iformat->long_name, "QuickTime / MOV") || + !strcmp(av_state_->ctx_->iformat->long_name, "FLV (Flash Video)") || + !strcmp(av_state_->ctx_->iformat->long_name, "Matroska / WebM") || + !strcmp(av_state_->ctx_->iformat->long_name, "raw HEVC video")) { + filtername = "hevc_mp4toannexb"; + } break; case AVCodecID::AV_CODEC_ID_MPEG4: if (!strcmp(av_state_->ctx_->iformat->name, "avi")) { @@ -399,7 +406,7 @@ FramesDecoderGpu::FramesDecoderGpu( InitGpuParser(); } -int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *picture_params) { +int FramesDecoderGpu::ProcessPictureDecode(CUVIDPICPARAMS *picture_params) { // Sending empty packet will call this callback. // If we want to flush the decoder, we do not need to do anything here if (flush_) { @@ -407,12 +414,21 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict } CUDA_CALL(cuvidDecodePicture(nvdecode_state_->decoder, picture_params)); + CUVIDPARSERDISPINFO picture_display_info; + memset(&picture_display_info, 0, sizeof(picture_display_info)); + picture_display_info.picture_index = picture_params->CurrPicIdx; + picture_display_info.progressive_frame = !picture_params->field_pic_flag; + picture_display_info.top_field_first = picture_params->bottom_field_flag ^ 1; + HandlePictureDisplay(&picture_display_info); + + return 1; +} - // Process decoded frame for output +int FramesDecoderGpu::HandlePictureDisplay(CUVIDPARSERDISPINFO *picture_display_info) { CUVIDPROCPARAMS videoProcessingParameters = {}; - videoProcessingParameters.progressive_frame = !picture_params->field_pic_flag; + videoProcessingParameters.progressive_frame = !picture_display_info->progressive_frame; videoProcessingParameters.second_field = 1; - videoProcessingParameters.top_field_first = picture_params->bottom_field_flag ^ 1; + videoProcessingParameters.top_field_first = picture_display_info->top_field_first; videoProcessingParameters.unpaired_field = 0; videoProcessingParameters.output_stream = stream_; @@ -454,7 +470,7 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict CUDA_CALL(cuvidMapVideoFrame( nvdecode_state_->decoder, - picture_params->CurrPicIdx, + picture_display_info->picture_index, &frame, &pitch, &videoProcessingParameters)); @@ -529,7 +545,12 @@ bool FramesDecoderGpu::SendFrameToParser() { // Store pts from current packet to indicate, // that this frame is in the decoder - piped_pts_.push(av_state_->packet_->pts); + if (av_state_->packet_->pts != AV_NOPTS_VALUE) { + piped_pts_.push(av_state_->packet_->pts); + } else { + piped_pts_.push(frame_index_if_no_pts_); + frame_index_if_no_pts_++; + } // Add header needed for NVDECODE to the packet if (filtered_packet_->data) { @@ -557,35 +578,76 @@ bool FramesDecoderGpu::ReadNextFrameWithoutIndex(uint8_t *data, bool copy_to_out current_copy_to_output_ = copy_to_output; current_frame_output_ = data; + int frame_to_return_index = -1; + + // Handle the case, when packet has more frames that we have empty spots + // in the buffer. + // If so, we need to return frame from the buffer before sending last packet. + if (frame_index_if_no_pts_ != 0) { + if (NumEmptySpots() < (piped_pts_.size())) { + for (size_t i = 0; i < frame_buffer_.size(); ++i) { + if (frame_buffer_[i].pts_ == NextFrameIdx()) { + frame_to_return_index = i; + break; + } + } + } + } + // Initial fill of the buffer frame_returned_ = false; - while (HasEmptySlot() && more_frames_to_decode_ && !frame_returned_) { + while ( + HasEmptySlot() && + more_frames_to_decode_ && + !frame_returned_ && + frame_to_return_index == -1) { if (av_read_frame(av_state_->ctx_, av_state_->packet_) >= 0) { if (!SendFrameToParser()) { continue; } } else { - SendLastPacket(); - more_frames_to_decode_ = false; - } - } + // Handle the case, when last packet has more frames that we have empty spots + // in the buffer. + // If so, we need to return frame from the buffer before sending last packet. + if (frame_index_if_no_pts_ != 0) { + if (NumEmptySpots() < (piped_pts_.size())) { + for (size_t i = 0; i < frame_buffer_.size(); ++i) { + if (frame_buffer_[i].pts_ == NextFrameIdx()) { + frame_to_return_index = i; + break; + } + } + } + } - int frame_to_return_index = -1; - for (size_t i = 0; i < frame_buffer_.size(); ++i) { - if (frame_buffer_[i].pts_ != -1) { - frame_to_return_index = i; - break; + if (frame_to_return_index == -1) { + SendLastPacket(); + more_frames_to_decode_ = false; + } else { + break; + } } } - for (size_t i = 1; i < frame_buffer_.size(); ++i) { - if (frame_buffer_[i].pts_ != -1) { - if (frame_buffer_[frame_to_return_index].pts_ > frame_buffer_[i].pts_) { + if (frame_to_return_index == -1) { + for (size_t i = 0; i < frame_buffer_.size(); ++i) { + if (frame_buffer_[i].pts_ != -1) { frame_to_return_index = i; + break; + } + } + + for (size_t i = 1; i < frame_buffer_.size(); ++i) { + if (frame_buffer_[i].pts_ != -1) { + if (frame_buffer_[frame_to_return_index].pts_ > frame_buffer_[i].pts_) { + frame_to_return_index = i; + } } } } + // This has to be separate if statement, because condition + // might have changed in the previous one. if (frame_to_return_index == -1) { return true; } @@ -672,9 +734,21 @@ bool FramesDecoderGpu::IsBufferEmpty() const { return true; } +unsigned int FramesDecoderGpu::NumEmptySpots() const { + unsigned int num_empty = 0; + for (auto &frame : frame_buffer_) { + if (frame.pts_ == -1) { + num_empty++; + } + } + + return num_empty; +} + void FramesDecoderGpu::Reset() { SendLastPacket(true); more_frames_to_decode_ = true; + frame_index_if_no_pts_ = 0; FramesDecoder::Reset(); } diff --git a/dali/operators/reader/loader/video/frames_decoder_gpu.h b/dali/operators/reader/loader/video/frames_decoder_gpu.h index 66a384b44d9..179750951a7 100644 --- a/dali/operators/reader/loader/video/frames_decoder_gpu.h +++ b/dali/operators/reader/loader/video/frames_decoder_gpu.h @@ -156,7 +156,9 @@ class DLL_PUBLIC FramesDecoderGpu : public FramesDecoder { int NextFramePts() { return Index(NextFrameIdx()).pts; } - int ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *picture_params); + int ProcessPictureDecode(CUVIDPICPARAMS *picture_params); + + int HandlePictureDisplay(CUVIDPARSERDISPINFO *picture_display_info); FramesDecoderGpu(FramesDecoderGpu&&) = default; @@ -174,6 +176,9 @@ class DLL_PUBLIC FramesDecoderGpu : public FramesDecoder { bool flush_ = false; bool more_frames_to_decode_ = true; + // This is used to order the frames, if there is no pts + int frame_index_if_no_pts_ = 0; + AVBSFContext *bsfc_ = nullptr; AVPacket *filtered_packet_ = nullptr; @@ -205,6 +210,8 @@ class DLL_PUBLIC FramesDecoderGpu : public FramesDecoder { bool ReadNextFrameWithoutIndex(uint8_t *data, bool copy_to_output); bool SendFrameToParser(); + + unsigned int NumEmptySpots() const; }; } // namespace dali diff --git a/dali/operators/reader/loader/video/frames_decoder_test.cc b/dali/operators/reader/loader/video/frames_decoder_test.cc index 77ecad301c9..23f11cfeb83 100644 --- a/dali/operators/reader/loader/video/frames_decoder_test.cc +++ b/dali/operators/reader/loader/video/frames_decoder_test.cc @@ -31,8 +31,8 @@ namespace dali { class FramesDecoderTestBase : public VideoTestBase { public: - virtual void RunSequentialTest(FramesDecoder &decoder, TestVideo &ground_truth, - double eps = 1.0) { + virtual void RunSequentialForwardTest( + FramesDecoder &decoder, TestVideo &ground_truth, double eps = 1.0) { // Iterate through the whole video in order for (int i = 0; i < decoder.NumFrames(); ++i) { ASSERT_EQ(decoder.NextFrameIdx(), i); @@ -41,16 +41,16 @@ class FramesDecoderTestBase : public VideoTestBase { } ASSERT_EQ(decoder.NextFrameIdx(), -1); + } - decoder.Reset(); + virtual void RunSequentialTest( + FramesDecoder &decoder, TestVideo &ground_truth, double eps = 1.0) { + // Iterate through the whole video in order + RunSequentialForwardTest(decoder, ground_truth, eps); - for (int i = 0; i < decoder.NumFrames(); ++i) { - ASSERT_EQ(decoder.NextFrameIdx(), i); - decoder.ReadNextFrame(FrameData()); - AssertFrame(FrameData(), i, ground_truth, eps); - } + decoder.Reset(); - ASSERT_EQ(decoder.NextFrameIdx(), -1); + RunSequentialForwardTest(decoder, ground_truth, eps); } virtual void RunTest(FramesDecoder &decoder, TestVideo &ground_truth, double eps = 1.0) { @@ -400,4 +400,18 @@ TEST_F(FramesDecoderGpuTest, VfrFrameRateMpeg4MkvNoIndexNoFrameNum) { RunSequentialTest(decoder, vfr_videos_[1], 3.0); } +TEST_F(FramesDecoderGpuTest, RawH264) { + auto memory_video = MemoryVideo(cfr_raw_h264_videos_paths_[1]); + + FramesDecoderGpu decoder(memory_video.data(), memory_video.size(), 0, false); + RunSequentialForwardTest(decoder, cfr_videos_[1], 1.5); +} + +TEST_F(FramesDecoderGpuTest, RawH265) { + auto memory_video = MemoryVideo(cfr_raw_h264_videos_paths_[0]); + + FramesDecoderGpu decoder(memory_video.data(), memory_video.size(), 0, false); + RunSequentialForwardTest(decoder, cfr_videos_[0], 1.5); +} + } // namespace dali diff --git a/dali/operators/reader/loader/video/video_test_base.cc b/dali/operators/reader/loader/video/video_test_base.cc index 1bb731f69a4..1fc34ad0787 100644 --- a/dali/operators/reader/loader/video/video_test_base.cc +++ b/dali/operators/reader/loader/video/video_test_base.cc @@ -122,6 +122,14 @@ std::vector VideoTestBase::vfr_mpeg4_mkv_videos_paths_{ testing::dali_extra_path() + "/db/video/vfr/test_1_mpeg4.mkv", testing::dali_extra_path() + "/db/video/vfr/test_2_mpeg4.mkv"}; +std::vector VideoTestBase::cfr_raw_h264_videos_paths_{ + testing::dali_extra_path() + "/db/video/cfr/test_1.h264", + testing::dali_extra_path() + "/db/video/cfr/test_2.h264"}; + +std::vector VideoTestBase::cfr_raw_h265_videos_paths_{ + testing::dali_extra_path() + "/db/video/cfr/test_1.h265", + testing::dali_extra_path() + "/db/video/cfr/test_2.h265"}; + std::vector VideoTestBase::cfr_videos_; std::vector VideoTestBase::vfr_videos_; std::vector VideoTestBase::vfr_hevc_videos_; diff --git a/dali/operators/reader/loader/video/video_test_base.h b/dali/operators/reader/loader/video/video_test_base.h index 6d4cb2821a7..3fb23a891a3 100644 --- a/dali/operators/reader/loader/video/video_test_base.h +++ b/dali/operators/reader/loader/video/video_test_base.h @@ -127,6 +127,9 @@ class VideoTestBase : public ::testing::Test { static std::vector vfr_videos_; static std::vector vfr_hevc_videos_; + static std::vector cfr_raw_h264_videos_paths_; + static std::vector cfr_raw_h265_videos_paths_; + static void SetUpTestSuite(); void RunFailureTest(std::function body, std::string expected_error);