NVIDIA · awolant · Jun 17, 2024 · Dec 18, 2023 · Feb 8, 2024 · Feb 19, 2024
diff --git a/dali/test/python/test_dali_video_plugin_decoder.py b/dali/test/python/test_dali_video_plugin_decoder.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from nvidia.dali import pipeline_def
+import nvidia.dali.types as types
+import nvidia.dali.fn as fn
+
+import cv2
+import numpy as np
+
+import nvidia.dali.fn.plugin.video as video
+
+from test_utils import get_dali_extra_path
+
+
+@pipeline_def(device_id=0, num_threads=4, batch_size=1)
+def video_decoder_pipeline(data_path):
+    data = fn.external_source(
+        source=lambda: [
+            np.fromfile(data_path, np.uint8),
+        ],
+        dtype=types.UINT8,
+    )
+    return video.decoder(data, device="mixed", end_frame=50)
+
+
+def run_video_decoding_test(test_file_path, frame_list_file_path, frames_directory_path):
+    pipeline = video_decoder_pipeline(test_file_path)
+    pipeline.build()
+
+    (output,) = pipeline.run()
+    frames = output.as_cpu().as_array()
+
+    with open(frame_list_file_path) as f:
+        frame_list = f.read().splitlines()
+
+    for i, frame in enumerate(frames[0]):
+        # Check if the frame is equal to the ground truth frame.
+        # Due to differences in how the decoding is implemented in
+        # different video codecs, we can't guarantee that the frames
+        # will be exactly the same. Main purpose of this test is to
+        # check if the decoding is working and we hit the correct frames.
+        ground_truth = cv2.imread(f"{frames_directory_path}/{frame_list[i]}")
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+        if not np.average(frame - ground_truth) < 15:
+            assert False, f"Frame {i} is not equal"
+
+
+def test_cfr_h264_mp4_decoding():
+    run_video_decoding_test(
+        get_dali_extra_path() + "/db/video/cfr/test_1.mp4",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1/frames_list.txt",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1",
+    )
+
+
+def test_cfr_h264_raw_decoding():
+    run_video_decoding_test(
+        get_dali_extra_path() + "/db/video/cfr/test_1.h264",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1/frames_list.txt",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1",
+    )
+
+
+def test_cfr_h265_mp4_decoding():
+    run_video_decoding_test(
+        get_dali_extra_path() + "/db/video/cfr/test_1_hevc.mp4",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1/frames_list.txt",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1",
+    )
+
+
+def test_cfr_h265_raw_decoding():
+    run_video_decoding_test(
+        get_dali_extra_path() + "/db/video/cfr/test_1.h265",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1/frames_list.txt",
+        f"{get_dali_extra_path()}/db/video/cfr/frames_1",
+    )
+
+
+def test_vfr_h264_mp4_decoding():
+    run_video_decoding_test(
+        get_dali_extra_path() + "/db/video/vfr/test_1.mp4",
+        f"{get_dali_extra_path()}/db/video/vfr/frames_1/frames_list.txt",
+        f"{get_dali_extra_path()}/db/video/vfr/frames_1",
+    )
+
+
+def test_vfr_hevc_mp4_decoding():
+    run_video_decoding_test(
+        get_dali_extra_path() + "/db/video/vfr/test_1_hevc.mp4",
+        f"{get_dali_extra_path()}/db/video/vfr/frames_1_hevc/frames_list.txt",
+        f"{get_dali_extra_path()}/db/video/vfr/frames_1_hevc",
+    )
diff --git a/plugins/video/pkg_src/CMakeLists.txt b/plugins/video/pkg_src/CMakeLists.txt
@@ -55,6 +55,7 @@ add_subdirectory("${pynvvideocodec_SOURCE_DIR}/src/VideoCodecSDKUtils/")
 
 set(VIDEO_PLUGIN_SOURCES
     src/decoder/video_decoder_mixed.cc
+    src/decoder/color_space.cu
 )
 
 add_library(dali_${PLUGIN_NAME} SHARED ${VIDEO_PLUGIN_SOURCES})

diff --git a/plugins/video/pkg_src/src/decoder/color_space.cu b/plugins/video/pkg_src/src/decoder/color_space.cu
@@ -0,0 +1,86 @@
+// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "decoder/color_space.h"
+
+#include <cuda_runtime.h>
+#include "dali/kernels/imgproc/sampler.h"
+#include "dali/kernels/imgproc/color_manipulation/color_space_conversion_impl.h"
+
+namespace dali {
+
+template <bool full_range>
+__global__ static void yuv_to_rgb_kernel(
+    const uint8_t *yuv, int yuv_pitch, uint8_t *rgb, int rgb_pitch, int width, int height) {
+    int halfx = (threadIdx.x + blockIdx.x * blockDim.x);
+    int halfy = (threadIdx.y + blockIdx.y * blockDim.y);
+    int x = 2 * halfx;
+    int y = 2 * halfy;
+    if (x >= width || y >= height) {
+        return;
+    }
+
+    kernels::Surface2D<const uint8_t> Y_surf, UV_surf;
+    kernels::Surface2D<uint8_t> RGB;
+    const uint8_t *chroma = yuv + height * yuv_pitch;
+
+    Y_surf  = { yuv,    width,     height,     1, 1, yuv_pitch, 1 };
+    UV_surf = { chroma, width / 2, height / 2, 2, 2, yuv_pitch, 1 };
+
+    RGB = { rgb, width, height, 3, 3, rgb_pitch, 1 };
+
+    auto Y = kernels::make_sampler<DALI_INTERP_NN>(Y_surf);
+    auto UV = kernels::make_sampler<DALI_INTERP_LINEAR>(UV_surf);
+
+    #pragma unroll
+    for (int i = 0; i < 2; i++) {
+        float cy = halfy + i * 0.5f + 0.25f;
+        #pragma unroll
+        for (int j = 0; j < 2; j++) {
+            float cx = halfx + j * 0.5f + 0.25f;
+            u8vec3 yuv_val;
+            yuv_val[0] = Y.at(ivec2{x + j, y + i}, 0, kernels::BorderClamp());
+
+            UV(&yuv_val[1], vec2(cx, cy), kernels::BorderClamp());
+
+            u8vec3 rgb_val;
+            if (full_range)
+                rgb_val = dali::kernels::color::jpeg::ycbcr_to_rgb<uint8_t>(yuv_val);
+            else
+                rgb_val = dali::kernels::color::itu_r_bt_601::ycbcr_to_rgb<uint8_t>(yuv_val);
+
+            RGB({x + j, y + i, 0}) = rgb_val.x;
+            RGB({x + j, y + i, 1}) = rgb_val.y;
+            RGB({x + j, y + i, 2}) = rgb_val.z;
+        }
+    }
+}
+
+}  // namespace dali
+
+void yuv_to_rgb(uint8_t *yuv, int yuv_pitch, uint8_t *rgb, int rgb_pitch, int width, int height,
+                bool full_range, cudaStream_t stream) {
+    auto grid_layout = dim3((width + 63) / 32 / 2, (height + 3));
+    auto block_layout = dim3(32, 2);
+
+    if (full_range) {
+        dali::yuv_to_rgb_kernel<true>
+            <<<grid_layout, block_layout, 0, stream>>>
+            (yuv, yuv_pitch, rgb, rgb_pitch, width, height);
+    } else {
+        dali::yuv_to_rgb_kernel<false>
+            <<<grid_layout, block_layout, 0, stream>>>
+            (yuv, yuv_pitch, rgb, rgb_pitch, width, height);
+    }
+}
diff --git a/plugins/video/pkg_src/src/decoder/color_space.h b/plugins/video/pkg_src/src/decoder/color_space.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PLUGINS_VIDEO_PKG_SRC_SRC_DECODER_COLOR_SPACE_H_
+#define PLUGINS_VIDEO_PKG_SRC_SRC_DECODER_COLOR_SPACE_H_
+
+#include <stdint.h>
+
+void yuv_to_rgb(
+    uint8_t *yuv,
+    int yuv_pitch,
+    uint8_t *rgb,
+    int rgb_pitch,
+    int width,
+    int height,
+    bool full_range,
+    cudaStream_t stream);
+
+#endif  // PLUGINS_VIDEO_PKG_SRC_SRC_DECODER_COLOR_SPACE_H_
diff --git a/plugins/video/pkg_src/src/decoder/video_decoder_mixed.cc b/plugins/video/pkg_src/src/decoder/video_decoder_mixed.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,12 +15,14 @@
 #include "decoder/video_decoder_mixed.h"
 #include "dali/core/tensor_shape.h"
 
+#include "decoder/color_space.h"
+
 namespace dali_video {
 
 class MemoryVideoFile : public FFmpegDemuxer::DataProvider {
  public:
   MemoryVideoFile(const void *data, int64_t size)
-    : data_(static_cast<const uint8_t*>(data)), size_(size), position_(0) {}
+      : data_(static_cast<const uint8_t *>(data)), size_(size), position_(0) {}
 
 
   int GetData(uint8_t *buffer, int buffer_size) override {
@@ -41,8 +43,8 @@ class MemoryVideoFile : public FFmpegDemuxer::DataProvider {
   int64_t position_;
 };
 
-bool VideoDecoderMixed::SetupImpl(
-  std::vector<dali::OutputDesc> &output_desc, const dali::Workspace &ws) {
+bool VideoDecoderMixed::SetupImpl(std::vector<dali::OutputDesc> &output_desc,
+                                  const dali::Workspace &ws) {
   ValidateInput(ws);
   const auto &input = ws.Input<dali::CPUBackend>(0);
   int batch_size = input.num_samples();
@@ -51,13 +53,13 @@ bool VideoDecoderMixed::SetupImpl(
   samples_.resize(batch_size);
   dali::TensorListShape<> sh(batch_size, 4);
   for (int i = 0; i < batch_size; i++) {
-    auto& sample = samples_[i];
+    auto &sample = samples_[i];
     sample.data_provider_ =
         std::make_unique<MemoryVideoFile>(input.raw_tensor(i), input[i].shape().num_elements());
     sample.demuxer_ = std::make_unique<FFmpegDemuxer>(sample.data_provider_.get());
     sample.current_packet_ = std::make_unique<PacketData>();
-    sh.set_tensor_shape(
-        i, dali::TensorShape<>(10, sample.demuxer_->GetHeight(), sample.demuxer_->GetWidth(), 3));
+    sh.set_tensor_shape(i, dali::TensorShape<>(end_frame_, sample.demuxer_->GetHeight(),
+                                               sample.demuxer_->GetWidth(), 3));
   }
   output_desc.resize(1);
   output_desc[0].shape = sh;
@@ -69,47 +71,58 @@ void VideoDecoderMixed::Run(dali::Workspace &ws) {
   auto &output = ws.Output<dali::GPUBackend>(0);
   const auto &input = ws.Input<dali::CPUBackend>(0);
   int batch_size = input.num_samples();
-  int s = 0;
-
-  cuInit(0);
-  int nGpu = 0;
-  cuDeviceGetCount(&nGpu);
 
-  bool m_bDestroyContext = false;
   CUcontext cuContext = nullptr;
-  CUstream cuStream = nullptr;
-
-  for (int i = 0; i < batch_size; i++) {
-    cuCtxGetCurrent(&cuContext);
-    if (!cuContext) {
-      createCudaContext(&cuContext, device_id_, 0);
-      m_bDestroyContext = true;
-    }
-    cuCtxPopCurrent(&cuContext);
-  }
+  CUstream cuStream = ws.stream();
+  cuCtxGetCurrent(&cuContext);
 
-  if (!cuContext)
+  if (!cuContext) {
     throw std::runtime_error("Failed to create a cuda context");
-
-  cuCtxPushCurrent(cuContext);
-  cuStreamCreate(&cuStream, 0);
+  }
 
   for (int i = 0; i < batch_size; i++) {
+    auto output_sample = output[i];
+    uint8_t *output_data = output_sample.template mutable_data<uint8_t>();
+
     auto &sample = samples_[i];
     sample.decoder_ = std::make_unique<NvDecoder>(
         cuStream, cuContext, true, FFmpeg2NvCodecId(sample.demuxer_->GetVideoCodec()), false,
         false /*_enableasyncallocations*/, false);
 
 
-    uint8_t *pVideo = NULL;
-    int nVideoBytes = 0;
-    while (sample.demuxer_->Demux(&pVideo, &nVideoBytes)) {
-      if (nVideoBytes) {
-        auto vecTupFrame = sample.decoder_->Decode(pVideo, nVideoBytes);
+    uint8_t *pVideo = NULL, *pFrame = nullptr;
+    int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0;
+
+    int num_frames = 0;
+
+    do {
+      sample.demuxer_->Demux(&pVideo, &nVideoBytes);
+      nFrameReturned = sample.decoder_->Decode(pVideo, nVideoBytes);
+
+      for (int i = 0; i < nFrameReturned; i++) {
+        pFrame = sample.decoder_->GetFrame();
+
+        uint8_t *dpFrame = output_data + num_frames * sample.demuxer_->GetHeight() *
+                                             sample.demuxer_->GetWidth() * 3;
+        int nWidth = sample.decoder_->GetWidth();
+        int nPitch = sample.decoder_->GetWidth();
+        int iMatrix =
+            sample.decoder_->GetVideoFormatInfo().video_signal_description.matrix_coefficients;
+        bool full_range =
+            sample.decoder_->GetVideoFormatInfo().video_signal_description.video_full_range_flag;
+
+
+        yuv_to_rgb(pFrame, nPitch, reinterpret_cast<uint8_t *>(dpFrame),
+                   sample.decoder_->GetWidth() * 3, sample.decoder_->GetWidth(),
+                   sample.decoder_->GetHeight(), full_range, cuStream);
+
+        ++num_frames;
+        if (end_frame_ > 0 && num_frames >= end_frame_) {
+          break;
+        }
       }
-    }
+    } while (nVideoBytes);
   }
-  cuCtxPopCurrent(&cuContext);
 }
 
 
@@ -123,12 +136,13 @@ The video streams can be in most of the container file formats. FFmpeg is used t
     .NumInput(1)
     .NumOutput(1)
     .InputDox(0, "buffer", "TensorList", "Data buffer with a loaded video file.")
+    .AddOptionalArg("end_frame", R"code(Index of the end frame to be decoded.)code", 0)
     .AddOptionalArg("affine",
-    R"code(Applies only to the mixed backend type.
+                    R"code(Applies only to the mixed backend type.
 
 If set to True, each thread in the internal thread pool will be tied to a specific CPU core.
- Otherwise, the threads can be reassigned to any CPU core by the operating system.)code", true);
-
+ Otherwise, the threads can be reassigned to any CPU core by the operating system.)code",
+                    true);
 
 
 DALI_REGISTER_OPERATOR(plugin__video__Decoder, VideoDecoderMixed, dali::Mixed);