NVIDIA · awolant · Feb 18, 2022 · Feb 9, 2022 · Feb 9, 2022 · Feb 9, 2022
diff --git a/dali/operators/reader/CMakeLists.txt b/dali/operators/reader/CMakeLists.txt
@@ -32,12 +32,15 @@ list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/sequence_reader_op.c
 
 if (BUILD_FFMPEG)
   list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_cpu_op.cc")
-  list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_op_test.cc")
 endif()
 
 if (BUILD_NVDEC)
   list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_op.cc")
   list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_resize_op.cc")
+  list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_gpu_op.cc")
+  if (BUILD_TEST)
+    list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_op_test.cc")
+  endif()
 endif()
 
 if (BUILD_LIBTAR)

diff --git a/dali/operators/reader/loader/video/CMakeLists.txt b/dali/operators/reader/loader/video/CMakeLists.txt
@@ -16,6 +16,8 @@ if (BUILD_NVDEC)
   add_subdirectory(nvdecode)
   list(APPEND DALI_INST_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_gpu.h")
   list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_gpu.cc")
+  list(APPEND DALI_INST_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/video_loader_decoder_gpu.h")
+  list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_loader_decoder_gpu.cc")
   if (BUILD_TEST)
     list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_test.cc")
   endif()

diff --git a/dali/operators/reader/loader/video/frames_decoder.cc b/dali/operators/reader/loader/video/frames_decoder.cc
@@ -14,6 +14,7 @@
 
 #include "dali/operators/reader/loader/video/frames_decoder.h"
 #include <memory>
+#include <iomanip>
 #include "dali/core/error_handling.h"
 
 
@@ -173,13 +174,14 @@ bool FramesDecoder::ReadRegularFrame(uint8_t *data, bool copy_to_output) {
       break;
     }
 
+    LOG_LINE << "Read frame (ReadRegularFrame), index " << next_frame_idx_ << ", timestamp " <<
+      std::setw(5)  << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;
     if (!copy_to_output) {
       ++next_frame_idx_;
       return true;
     }
 
     CopyToOutput(data);
-    LOG_LINE << "Read frame (ReadRegularFrame), timestamp " << av_state_->frame_->pts << std::endl;
     ++next_frame_idx_;
     return true;
   }
@@ -257,10 +259,13 @@ bool FramesDecoder::ReadFlushFrame(uint8_t *data, bool copy_to_output) {
 
   if (copy_to_output) {
     CopyToOutput(data);
-    LOG_LINE << "Read frame (ReadFlushFrame), timestamp " << av_state_->frame_->pts << std::endl;
   }
 
+  LOG_LINE << "Read frame (ReadFlushFrame), index " << next_frame_idx_ << " timestamp " <<
+    std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;
   ++next_frame_idx_;
+
+  // TODO(awolant): Figure out how to handle this during index building
   if (next_frame_idx_ >= NumFrames()) {
     next_frame_idx_ = -1;
   }

diff --git a/dali/operators/reader/loader/video/frames_decoder.h b/dali/operators/reader/loader/video/frames_decoder.h
@@ -161,7 +161,7 @@ class DLL_PUBLIC FramesDecoder {
 
   std::vector<IndexEntry> index_;
 
-  int next_frame_idx_;
+  int next_frame_idx_ = 0;
 
  private:
    /**

diff --git a/dali/operators/reader/loader/video/frames_decoder_gpu.cc b/dali/operators/reader/loader/video/frames_decoder_gpu.cc
@@ -19,6 +19,7 @@
 
 #include <string>
 #include <memory>
+#include <iomanip>
 
 #include "dali/core/error_handling.h"
 #include "dali/core/cuda_utils.h"
@@ -125,6 +126,10 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
   if (current_pts == NextFramePts()) {
     // Currently decoded frame is actually the one we wanted
     frame_returned_ = true;
+
+    LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
+        std::setw(5) << current_pts << ", current copy " << current_copy_to_output_ << std::endl;
+
     if (current_copy_to_output_ == false) {
       return 1;
     }
@@ -155,12 +160,17 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
     Width(),
     Height(),
     stream_);
+  // TODO(awolant): Alterantive is to copy the data to a buffer
+  // and then process it on the stream. Check, if this is faster, when
+  // the benchmark is ready.
+  CUDA_CALL(cudaStreamSynchronize(stream_));
   CUDA_CALL(cuvidUnmapVideoFrame(nvdecode_state_->decoder, frame));
 
   return 1;
 }
 
 void FramesDecoderGpu::SeekFrame(int frame_id) {
+  // TODO(awolant): This seek can be optimized - for consecutive frames not needed etc.
   SendLastPacket(true);
   FramesDecoder::SeekFrame(frame_id);
 }
@@ -177,6 +187,9 @@ bool FramesDecoderGpu::ReadNextFrame(uint8_t *data, bool copy_to_output) {
       if (copy_to_output) {
         copyD2D(data, frame.frame_.data(), FrameSize());
       }
+      LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
+        std::setw(5) << frame.pts_ << ", current copy " << copy_to_output << std::endl;
+
       frame.pts_ = -1;
 
       ++next_frame_idx_;

diff --git a/dali/operators/reader/loader/video/frames_decoder_gpu.h b/dali/operators/reader/loader/video/frames_decoder_gpu.h
@@ -68,6 +68,8 @@ class DLL_PUBLIC FramesDecoderGpu : public FramesDecoder {
 
   int ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *picture_params);
 
+  FramesDecoderGpu(FramesDecoderGpu&&) = default;
+
   ~FramesDecoderGpu();
 
  private:

diff --git a/dali/operators/reader/loader/video/video_loader_decoder_gpu.cc b/dali/operators/reader/loader/video/video_loader_decoder_gpu.cc
@@ -0,0 +1,124 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dali/operators/reader/loader/video/video_loader_decoder_gpu.h"
+
+#include "dali/util/nvml.h"
+
+namespace dali {
+void VideoSampleGpu::Decode() {
+  TensorShape<4> shape = {
+    sequence_len_,
+    video_file_->Height(),
+    video_file_->Width(),
+    video_file_->Channels()};
+
+  data_.Resize(
+    shape,
+    DALIDataType::DALI_UINT8);
+
+  for (int i = 0; i < sequence_len_; ++i) {
+    int frame_id = span_->start_ + i * span_->stride_;
+    video_file_->SeekFrame(frame_id);
+    video_file_->ReadNextFrame(
+      static_cast<uint8_t *>(data_.raw_mutable_data()) + i * video_file_->FrameSize());
+  }
+}
+
+VideoLoaderDecoderGpu::~VideoLoaderDecoderGpu() {
+  CUDA_DTOR_CALL(cudaStreamDestroy(cuda_stream_));
+}
+
+cudaStream_t VideoLoaderDecoderGpu::GetCudaStream() {
+  #if NVML_ENABLED
+  {
+    nvml::Init();
+    static float driver_version = nvml::GetDriverVersion();
+    if (driver_version > 460 && driver_version < 470.21) {
+      DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
+      return 0;
+    }
+  }
+  #else
+  {
+    int driver_cuda_version = 0;
+    CUDA_CALL(cuDriverGetVersion(&driver_cuda_version));
+    if (driver_cuda_version >= 11030 && driver_cuda_version < 11040) {
+      DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
+      return 0;
+    }
+  }
+  #endif
+
+  // TODO(awolant): Check per decoder stream
+  cudaStream_t stream;
+  DeviceGuard dg(device_id_);
+  CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  return stream;
+}
+
+void VideoLoaderDecoderGpu::PrepareEmpty(VideoSampleGpu &sample) {
+  sample = {};
+}
+
+void VideoLoaderDecoderGpu::ReadSample(VideoSampleGpu &sample) {
+  auto &sample_span = sample_spans_[current_index_];
+
+  // Bind sample to the video and span, so it can be decoded later
+  sample.span_ = &sample_span;
+  sample.video_file_ = &video_files_[sample_span.video_idx_];
+  sample.sequence_len_ = sequence_len_;
+
+  if (has_labels_) {
+    sample.label_ = labels_[sample_span.video_idx_];
+  }
+
+  ++current_index_;
+  MoveToNextShard(current_index_);
+}
+
+Index VideoLoaderDecoderGpu::SizeImpl() {
+  return sample_spans_.size();
+}
+
+void VideoLoaderDecoderGpu::PrepareMetadataImpl() {
+  video_files_.reserve(filenames_.size());
+  for (auto &filename : filenames_) {
+    video_files_.emplace_back(filename, cuda_stream_);
+  }
+
+  for (size_t video_idx = 0; video_idx < video_files_.size(); ++video_idx) {
+    for (int start = 0;
+         start + stride_ * sequence_len_ <= video_files_[video_idx].NumFrames();
+         start += step_) {
+      sample_spans_.push_back(
+        VideoSampleDesc(start, start + stride_ * sequence_len_, stride_, video_idx));
+    }
+  }
+  if (shuffle_) {
+    // seeded with hardcoded value to get
+    // the same sequence on every shard
+    std::mt19937 g(kDaliDataloaderSeed);
+    std::shuffle(std::begin(sample_spans_), std::end(sample_spans_), g);
+  }
+
+  // set the initial index for each shard
+  Reset(true);
+}
+
+void VideoLoaderDecoderGpu::Reset(bool wrap_to_shard) {
+  current_index_ = wrap_to_shard ? start_index(shard_id_, num_shards_, SizeImpl()) : 0;
+}
+
+}  // namespace dali
diff --git a/dali/operators/reader/loader/video/video_loader_decoder_gpu.h b/dali/operators/reader/loader/video/video_loader_decoder_gpu.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
+#define DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
+
+#include <string>
+#include <vector>
+
+#include "dali/operators/reader/loader/loader.h"
+#include "dali/operators/reader/loader/video/video_loader_decoder_cpu.h"
+#include "dali/operators/reader/loader/video/frames_decoder_gpu.h"
+
+namespace dali {
+class VideoSampleGpu {
+ public:
+  void Decode();
+
+  FramesDecoderGpu *video_file_ = nullptr;
+  VideoSampleDesc *span_ = nullptr;
+  int sequence_len_ = 0;
+  Tensor<GPUBackend> data_;
+  int label_ = -1;
+};
+
+
+class VideoLoaderDecoderGpu : public Loader<GPUBackend, VideoSampleGpu> {
+ public:
+  explicit inline VideoLoaderDecoderGpu(const OpSpec &spec) :
+    Loader<GPUBackend, VideoSampleGpu>(spec),
+    filenames_(spec.GetRepeatedArgument<std::string>("filenames")),
+    sequence_len_(spec.GetArgument<int>("sequence_length")),
+    stride_(spec.GetArgument<int>("stride")),
+    step_(spec.GetArgument<int>("step")),
+    cuda_stream_(GetCudaStream()) {
+    if (step_ <= 0) {
+      step_ = stride_ * sequence_len_;
+    }
+    has_labels_ = spec.TryGetRepeatedArgument(labels_, "labels");
+  }
+
+  void ReadSample(VideoSampleGpu &sample) override;
+
+  void PrepareEmpty(VideoSampleGpu &sample) override;
+
+  ~VideoLoaderDecoderGpu();
+
+ protected:
+  Index SizeImpl() override;
+
+  void PrepareMetadataImpl() override;
+
+ private:
+  void Reset(bool wrap_to_shard) override;
+
+  cudaStream_t GetCudaStream();
+
+  std::vector<std::string> filenames_;
+  std::vector<int> labels_;
+  bool has_labels_ = false;
+  std::vector<FramesDecoderGpu> video_files_;
+  std::vector<VideoSampleDesc> sample_spans_;
+
+  Index current_index_ = 0;
+
+  int sequence_len_;
+  int stride_;
+  int step_;
+
+  cudaStream_t cuda_stream_;
+};
+
+}  // namespace dali
+
+#endif  // DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_