NVIDIA · awolant · Dec 13, 2022 · Nov 29, 2022 · Nov 30, 2022 · Nov 30, 2022
diff --git a/DALI_EXTRA_VERSION b/DALI_EXTRA_VERSION
@@ -1 +1 @@
-69ffed23b15233b583cb5a398e860a63863b2c99
+Update after merging https://github.com/NVIDIA/DALI_extra/pull/116
diff --git a/dali/operators/reader/loader/video/frames_decoder.cc b/dali/operators/reader/loader/video/frames_decoder.cc
@@ -129,7 +129,9 @@ bool FramesDecoder::CheckCodecSupport() {
 
 void FramesDecoder::FindVideoStream(bool init_codecs) {
   if (init_codecs) {
-    for (size_t i = 0; i < av_state_->ctx_->nb_streams; ++i) {
+    size_t i = 0;
+
+    for (i = 0; i < av_state_->ctx_->nb_streams; ++i) {
       av_state_->codec_params_ = av_state_->ctx_->streams[i]->codecpar;
       av_state_->codec_ = avcodec_find_decoder(av_state_->codec_params_->codec_id);
 
@@ -139,11 +141,12 @@ void FramesDecoder::FindVideoStream(bool init_codecs) {
 
       if (av_state_->codec_->type == AVMEDIA_TYPE_VIDEO) {
         av_state_->stream_id_ = i;
-        return;
+        break;
       }
     }
 
-    DALI_FAIL(make_string("Could not find a valid video stream in a file ", Filename()));
+    DALI_ENFORCE(i < av_state_->ctx_->nb_streams,
+                 make_string("Could not find a valid video stream in a file ", Filename()));
   } else {
     av_state_->stream_id_ = av_find_best_stream(av_state_->ctx_, AVMEDIA_TYPE_VIDEO,
                                                 -1, -1, nullptr, 0);
@@ -154,6 +157,10 @@ void FramesDecoder::FindVideoStream(bool init_codecs) {
 
     av_state_->codec_params_ = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar;
   }
+  if (Height() == 0 || Width() == 0) {
+    DALI_ENFORCE(avformat_find_stream_info(av_state_->ctx_, nullptr) >= 0);
+    DALI_ENFORCE(Height() != 0 && Width() != 0, "Couldn't load video size info.");
+  }
 }
 
 FramesDecoder::FramesDecoder(const std::string &filename)
@@ -236,20 +243,97 @@ FramesDecoder::FramesDecoder(const char *memory_file, int memory_file_size, bool
   DetectVfr();
 }
 
+/**
+ * @brief Fills `av_state` with a freshly opened FFmpeg context that reads from
+ * `memory_video_file_` through a custom AVIOContext.
+ *
+ * Sets `ctx_`, `stream_id_`, `codec_params_`, `codec_ctx_` and `packet_` of `av_state`.
+ * `codec_` is not looked up here, so `avcodec_alloc_context3` receives whatever
+ * `av_state->codec_` currently holds.
+ *
+ * @param av_state State object to populate; must point to a valid AvState.
+ * @param init_codecs Currently unused by this function.
+ *
+ * Fails through DALI_ENFORCE / DALI_FAIL when an allocation fails, the input cannot
+ * be opened, or no video stream is found.
+ */
+void FramesDecoder::CreateAvState(std::unique_ptr<AvState> &av_state, bool init_codecs) {
+  av_state->ctx_ = avformat_alloc_context();
+  // Check the state being built (the parameter), not the decoder's own av_state_.
+  DALI_ENFORCE(av_state->ctx_, "Could not alloc avformat context");
+
+  uint8_t *av_io_buffer = static_cast<uint8_t *>(av_malloc(default_av_buffer_size));
+  DALI_ENFORCE(av_io_buffer, "Could not alloc avio buffer");
+
+  AVIOContext *av_io_context = avio_alloc_context(
+    av_io_buffer,
+    default_av_buffer_size,
+    0,
+    &memory_video_file_.value(),
+    detail::read_memory_video_file,
+    nullptr,
+    detail::seek_memory_video_file);
+  if (av_io_context == nullptr) {
+    // The buffer has no owner yet, so release it before reporting the failure.
+    av_free(av_io_buffer);
+    DALI_FAIL("Could not alloc avio context");
+  }
+
+  av_state->ctx_->pb = av_io_context;
+
+  int ret = avformat_open_input(&av_state->ctx_, "", nullptr, nullptr);
+  if (ret != 0) {
+    // On failure avformat_open_input frees the format context and nulls the pointer,
+    // but a user-supplied AVIOContext stays ours to release.
+    av_freep(&av_io_context->buffer);
+    avio_context_free(&av_io_context);
+  }
+  DALI_ENFORCE(
+    ret == 0,
+    make_string(
+      "Failed to open video file ",
+      Filename(),
+      " due to ",
+      detail::av_error_string(ret)));
+
+  av_state->stream_id_ = av_find_best_stream(
+    av_state->ctx_, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
+  // av_find_best_stream returns a negative error code when no stream matches;
+  // it must not be used as an index into streams[].
+  DALI_ENFORCE(
+    av_state->stream_id_ >= 0,
+    make_string("Could not find a valid video stream in a file ", Filename()));
+  av_state->codec_params_ = av_state->ctx_->streams[av_state->stream_id_]->codecpar;
+
+  av_state->codec_ctx_ = avcodec_alloc_context3(av_state->codec_);
+  DALI_ENFORCE(av_state->codec_ctx_, "Could not alloc av codec context");
+
+  ret = avcodec_parameters_to_context(av_state->codec_ctx_, av_state->codec_params_);
+  DALI_ENFORCE(
+    ret >= 0,
+    make_string("Could not fill the codec based on parameters: ", detail::av_error_string(ret)));
+
+  av_state->packet_ = av_packet_alloc();
+  DALI_ENFORCE(av_state->packet_, "Could not allocate av packet");
+}
+
 void FramesDecoder::ParseNumFrames() {
   int curr_num_frames = 0;
-  while (av_read_frame(av_state_->ctx_, av_state_->packet_) >= 0) {
-    // We want to make sure that we call av_packet_unref in every iteration
-    auto packet = AVPacketScope(av_state_->packet_, av_packet_unref);
 
-    if (packet->stream_index != av_state_->stream_id_) {
-      continue;
+  if (IsFormatSeekable()) {
+    while (av_read_frame(av_state_->ctx_, av_state_->packet_) >= 0) {
+      // We want to make sure that we call av_packet_unref in every iteration
+      auto packet = AVPacketScope(av_state_->packet_, av_packet_unref);
+
+      if (packet->stream_index != av_state_->stream_id_) {
+        continue;
+      }
+      curr_num_frames++;
+    }
+
+    num_frames_ = curr_num_frames;
+    Reset();
+  } else {
+    // Failover for unseekable video
+    auto current_position = memory_video_file_->position_;
+    memory_video_file_->Seek(0, SEEK_SET);
+    std::unique_ptr<AvState> tmp_av_state = std::make_unique<AvState>();
+    CreateAvState(tmp_av_state, false);
+
+    while (av_read_frame(tmp_av_state->ctx_, tmp_av_state->packet_) >= 0) {
+      // We want to make sure that we call av_packet_unref in every iteration
+      auto packet = AVPacketScope(tmp_av_state->packet_, av_packet_unref);
+
+      if (packet->stream_index != tmp_av_state->stream_id_) {
+        continue;
+      }
+      curr_num_frames++;
+    }
+
+    num_frames_ = curr_num_frames;
+    memory_video_file_->Seek(current_position, SEEK_SET);
+
+    if (tmp_av_state->packet_->pts == AV_NOPTS_VALUE) {
+      // zero_latency_ = false;
     }
-    curr_num_frames++;
   }
+}
 
-  num_frames_ = curr_num_frames;
-  Reset();
+/**
+ * @brief Tells whether the currently opened input is treated as seekable.
+ *
+ * @return true only when the demuxer exposes read_seek or read_seek2 and the
+ * I/O context exposes read_seek; false otherwise.
+ */
+bool FramesDecoder::IsFormatSeekable() {
+  const auto *format_ctx = av_state_->ctx_;
+  const bool demuxer_can_seek =
+    format_ctx->iformat->read_seek != nullptr || format_ctx->iformat->read_seek2 != nullptr;
+
+  // The I/O context is inspected only when the demuxer itself supports seeking.
+  // NOTE(review): AVIOContext::read_seek looks like the timestamp-seek hook rather
+  // than the byte-seek callback (`seek` / `seekable`) - confirm this is the intended field.
+  return demuxer_can_seek && format_ctx->pb->read_seek != nullptr;
 }
 
 void FramesDecoder::BuildIndex() {

diff --git a/dali/operators/reader/loader/video/frames_decoder.h b/dali/operators/reader/loader/video/frames_decoder.h
@@ -58,8 +58,13 @@ struct AvState {
       av_frame_free(&frame_);
     }
     avcodec_free_context(&codec_ctx_);
-    avformat_close_input(&ctx_);
-    avformat_free_context(ctx_);
+    if (ctx_ != nullptr) {
+      if (ctx_->pb != nullptr) {
+        avio_context_free(&ctx_->pb);
+      }
+      avformat_close_input(&ctx_);
+      avformat_free_context(ctx_);
+    }
 
     ctx_ = nullptr;
     codec_ = nullptr;
@@ -207,6 +212,8 @@ class DLL_PUBLIC FramesDecoder {
 
   bool is_full_range_ = false;
 
+  std::optional<bool> zero_latency_ = {};
+
  private:
    /**
    * @brief Gets the packet from the decoder and reads a frame from it to provided buffer. Returns
@@ -249,6 +256,10 @@ class DLL_PUBLIC FramesDecoder {
 
   void ParseNumFrames();
 
+  void CreateAvState(std::unique_ptr<AvState> &av_state, bool init_codecs);
+
+  bool IsFormatSeekable();
+
   std::string Filename() {
     return filename_.has_value() ? filename_.value() : "memory file";
   }