Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VideoReaderDecoder GPU #3668

Merged
merged 36 commits into from
Feb 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
f678967
Add proper destructors
awolant Feb 9, 2022
a5f3c59
Add proper packet unrefs
awolant Feb 9, 2022
39bc25d
Add av_packet scope
awolant Feb 9, 2022
3c27142
Add nvdecode state destructor
awolant Feb 9, 2022
1a40724
Fix
awolant Feb 9, 2022
fa51886
Fix double free
awolant Feb 9, 2022
d883e6d
Add VideoReaderDecoderOp GPU
awolant Feb 9, 2022
45028d1
Fix segfault in packet destruction
awolant Feb 9, 2022
cec57de
Merge branch 'fix_frames_decoder_destruction' into add_video_reader_d…
awolant Feb 9, 2022
18c3118
Fix lint
awolant Feb 9, 2022
519c034
Merge branch 'fix_frames_decoder_destruction' into add_video_reader_d…
awolant Feb 9, 2022
468a3ad
Merge remote-tracking branch 'nvidia/main' into add_video_reader_deco…
awolant Feb 10, 2022
00c214f
Fix linter
awolant Feb 10, 2022
14a9e07
More logging
awolant Feb 10, 2022
30a02d4
Fix indexing
awolant Feb 10, 2022
9aace8b
Fix indexing
awolant Feb 10, 2022
e50982c
Improve logging
awolant Feb 10, 2022
4e109cd
Make test not crash
awolant Feb 11, 2022
5b2cffb
Make test test something
awolant Feb 11, 2022
8a9fc63
Tmp saving frames
awolant Feb 11, 2022
2befc78
Add labels support
awolant Feb 11, 2022
db02426
Fix linter
awolant Feb 11, 2022
80cd022
Fix BUILD_NVDEC=OFF build
awolant Feb 12, 2022
3b0e759
Move copy to sample
awolant Feb 12, 2022
6020269
Add stream propagation from opeartor to decoder
awolant Feb 14, 2022
4ea6500
Fix review, move decoding to RunImpl
awolant Feb 14, 2022
189f3a3
Merge branch 'add_video_reader_decoder_gpu' of https://github.com/awo…
awolant Feb 14, 2022
b0ac5c7
Fix lint
awolant Feb 14, 2022
08ef040
Add layout support
awolant Feb 14, 2022
696fa54
Remove debug stuff
awolant Feb 14, 2022
000a94c
Move decoding to prefetch
awolant Feb 16, 2022
76c64ee
Add SetumImpl, fix review comments
awolant Feb 16, 2022
1994dda
Fix review comments
awolant Feb 16, 2022
493cd28
Add stream support
awolant Feb 17, 2022
bfd225e
Fix destructor
awolant Feb 17, 2022
11bc332
Fix review comments
awolant Feb 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion dali/operators/reader/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@ list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/sequence_reader_op.c

if (BUILD_FFMPEG)
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_cpu_op.cc")
list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_op_test.cc")
endif()

if (BUILD_NVDEC)
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_op.cc")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_resize_op.cc")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_gpu_op.cc")
if (BUILD_TEST)
list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_decoder_op_test.cc")
endif()
Comment on lines +40 to +43
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fixes a build error that I discovered during implementing this task.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

endif()

if (BUILD_LIBTAR)
Expand Down
2 changes: 2 additions & 0 deletions dali/operators/reader/loader/video/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ if (BUILD_NVDEC)
add_subdirectory(nvdecode)
list(APPEND DALI_INST_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_gpu.h")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_gpu.cc")
list(APPEND DALI_INST_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/video_loader_decoder_gpu.h")
list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_loader_decoder_gpu.cc")
if (BUILD_TEST)
list(APPEND DALI_OPERATOR_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/frames_decoder_test.cc")
endif()
Expand Down
9 changes: 7 additions & 2 deletions dali/operators/reader/loader/video/frames_decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "dali/operators/reader/loader/video/frames_decoder.h"
#include <memory>
#include <iomanip>
#include "dali/core/error_handling.h"


Expand Down Expand Up @@ -173,13 +174,14 @@ bool FramesDecoder::ReadRegularFrame(uint8_t *data, bool copy_to_output) {
break;
}

LOG_LINE << "Read frame (ReadRegularFrame), index " << next_frame_idx_ << ", timestamp " <<
std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;
Comment on lines +177 to +178
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I improved logging in FramesDecoder during debugging things I'm adding in this task.

if (!copy_to_output) {
++next_frame_idx_;
return true;
}

CopyToOutput(data);
LOG_LINE << "Read frame (ReadRegularFrame), timestamp " << av_state_->frame_->pts << std::endl;
++next_frame_idx_;
return true;
}
Expand Down Expand Up @@ -257,10 +259,13 @@ bool FramesDecoder::ReadFlushFrame(uint8_t *data, bool copy_to_output) {

if (copy_to_output) {
CopyToOutput(data);
LOG_LINE << "Read frame (ReadFlushFrame), timestamp " << av_state_->frame_->pts << std::endl;
}

LOG_LINE << "Read frame (ReadFlushFrame), index " << next_frame_idx_ << " timestamp " <<
std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;
Comment on lines +264 to +265
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I improved logging in FramesDecoder during debugging things I'm adding in this task.

++next_frame_idx_;

// TODO(awolant): Figure out how to handle this during index building
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something I discovered during this task. Will be handled in the future, as this is minor inconvenience.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So how this is handled now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was always handled through checking the return value of the decoding function.
The problem right now is that during the index build, we don't yet have number of frames. I used that to set the index of the current frame. So during the index building, index of the last frame is not set correctly. This shows in log.
After the index build we immediately call Reset and it sets the current frame index to 0, so everything is fine. For now, I wanted to left it as it is, because all of the ideas I have atm involve web of ifs or something like that. And the only consequence is that in the log for the index building we have index of the last frame -1 instead of whatever it should be. I'll figure this out later.

if (next_frame_idx_ >= NumFrames()) {
next_frame_idx_ = -1;
}
Expand Down
2 changes: 1 addition & 1 deletion dali/operators/reader/loader/video/frames_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ class DLL_PUBLIC FramesDecoder {

std::vector<IndexEntry> index_;

int next_frame_idx_;
int next_frame_idx_ = 0;

private:
/**
Expand Down
13 changes: 13 additions & 0 deletions dali/operators/reader/loader/video/frames_decoder_gpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <string>
#include <memory>
#include <iomanip>

#include "dali/core/error_handling.h"
#include "dali/core/cuda_utils.h"
Expand Down Expand Up @@ -125,6 +126,10 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
if (current_pts == NextFramePts()) {
// Currently decoded frame is actually the one we wanted
frame_returned_ = true;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
std::setw(5) << current_pts << ", current copy " << current_copy_to_output_ << std::endl;
Comment on lines +130 to +131
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I improved logging in FramesDecoder during debugging things I'm adding in this task.


if (current_copy_to_output_ == false) {
return 1;
}
Expand Down Expand Up @@ -155,12 +160,17 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
Width(),
Height(),
stream_);
// TODO(awolant): Alterantive is to copy the data to a buffer
// and then process it on the stream. Check, if this is faster, when
// the benchmark is ready.
CUDA_CALL(cudaStreamSynchronize(stream_));
Copy link
Contributor Author

@awolant awolant Feb 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though we pass stream_ do NVDEC it turns out it is necessary to synchronize the stream before we unmap the frame.
It is not a big problem as we do not do anything else on it and synchronize it immediately after Run.
When we have proper benchmark (DALI-2594) we can test the alternative approach and pick the better one.

CUDA_CALL(cuvidUnmapVideoFrame(nvdecode_state_->decoder, frame));

return 1;
}

void FramesDecoderGpu::SeekFrame(int frame_id) {
// TODO(awolant): This seek can be optimized - for consecutive frames not needed etc.
SendLastPacket(true);
FramesDecoder::SeekFrame(frame_id);
}
Expand All @@ -177,6 +187,9 @@ bool FramesDecoderGpu::ReadNextFrame(uint8_t *data, bool copy_to_output) {
if (copy_to_output) {
copyD2D(data, frame.frame_.data(), FrameSize());
}
LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
std::setw(5) << frame.pts_ << ", current copy " << copy_to_output << std::endl;

frame.pts_ = -1;

++next_frame_idx_;
Expand Down
2 changes: 2 additions & 0 deletions dali/operators/reader/loader/video/frames_decoder_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class DLL_PUBLIC FramesDecoderGpu : public FramesDecoder {

int ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *picture_params);

FramesDecoderGpu(FramesDecoderGpu&&) = default;

~FramesDecoderGpu();

private:
Expand Down
124 changes: 124 additions & 0 deletions dali/operators/reader/loader/video/video_loader_decoder_gpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/operators/reader/loader/video/video_loader_decoder_gpu.h"

#include "dali/util/nvml.h"

namespace dali {
void VideoSampleGpu::Decode() {
TensorShape<4> shape = {
sequence_len_,
video_file_->Height(),
video_file_->Width(),
video_file_->Channels()};

data_.Resize(
shape,
DALIDataType::DALI_UINT8);

for (int i = 0; i < sequence_len_; ++i) {
int frame_id = span_->start_ + i * span_->stride_;
video_file_->SeekFrame(frame_id);
video_file_->ReadNextFrame(
static_cast<uint8_t *>(data_.raw_mutable_data()) + i * video_file_->FrameSize());
}
}

VideoLoaderDecoderGpu::~VideoLoaderDecoderGpu() {
CUDA_DTOR_CALL(cudaStreamDestroy(cuda_stream_));
}

cudaStream_t VideoLoaderDecoderGpu::GetCudaStream() {
#if NVML_ENABLED
{
nvml::Init();
static float driver_version = nvml::GetDriverVersion();
if (driver_version > 460 && driver_version < 470.21) {
DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
return 0;
}
}
#else
{
int driver_cuda_version = 0;
CUDA_CALL(cuDriverGetVersion(&driver_cuda_version));
if (driver_cuda_version >= 11030 && driver_cuda_version < 11040) {
DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
return 0;
}
}
#endif

// TODO(awolant): Check per decoder stream
cudaStream_t stream;
DeviceGuard dg(device_id_);
CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using CUDAStream, or even better, just lease one from the pool:

dali::CUDAStreamPool::instance().Get(device_id_);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't use CUDAStream as this is derived from UniqueHandle and I want to share this stream between decoders for now.

return stream;
}

void VideoLoaderDecoderGpu::PrepareEmpty(VideoSampleGpu &sample) {
sample = {};
}

void VideoLoaderDecoderGpu::ReadSample(VideoSampleGpu &sample) {
auto &sample_span = sample_spans_[current_index_];

// Bind sample to the video and span, so it can be decoded later
sample.span_ = &sample_span;
sample.video_file_ = &video_files_[sample_span.video_idx_];
sample.sequence_len_ = sequence_len_;

if (has_labels_) {
sample.label_ = labels_[sample_span.video_idx_];
}
Comment on lines +83 to +85
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know it's not possible with current code, but what if this "sample" had a label in a previous iteration? we wouldn't be clearing it. How about:

sample.label_ = has_labels_ ? labels_[sample_span.video_idx_] : NO_LABEL;

being no-label a default value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added label_ = -1 in the class, so I don't think we need it here.


++current_index_;
MoveToNextShard(current_index_);
}

Index VideoLoaderDecoderGpu::SizeImpl() {
return sample_spans_.size();
}

void VideoLoaderDecoderGpu::PrepareMetadataImpl() {
video_files_.reserve(filenames_.size());
Copy link
Contributor

@JanuszL JanuszL Feb 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. So we have input files number amount of FramesDecoderGpu instances (including decoder instances inside).
I'm not sure how many of them we can have in parallel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Solving this properly is part of DALI-2321 to be done when we have benchmark (DALI-2594). Before it is hard too tell anything about performance impact of any possible solution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is not about the perf, rather about resource constrains. I think creating 1000 decoders and parsers will consume a lot of resources.
Also we have already hit a maximum amount of files opened in parallel in the old VideoReader (libaviutil).

for (auto &filename : filenames_) {
video_files_.emplace_back(filename, cuda_stream_);
}

for (size_t video_idx = 0; video_idx < video_files_.size(); ++video_idx) {
for (int start = 0;
start + stride_ * sequence_len_ <= video_files_[video_idx].NumFrames();
start += step_) {
sample_spans_.push_back(
VideoSampleDesc(start, start + stride_ * sequence_len_, stride_, video_idx));
}
}
if (shuffle_) {
// seeded with hardcoded value to get
// the same sequence on every shard
std::mt19937 g(kDaliDataloaderSeed);
std::shuffle(std::begin(sample_spans_), std::end(sample_spans_), g);
}

// set the initial index for each shard
Reset(true);
}

void VideoLoaderDecoderGpu::Reset(bool wrap_to_shard) {
current_index_ = wrap_to_shard ? start_index(shard_id_, num_shards_, SizeImpl()) : 0;
}

} // namespace dali
86 changes: 86 additions & 0 deletions dali/operators/reader/loader/video/video_loader_decoder_gpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
#define DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_

#include <string>
#include <vector>

#include "dali/operators/reader/loader/loader.h"
#include "dali/operators/reader/loader/video/video_loader_decoder_cpu.h"
#include "dali/operators/reader/loader/video/frames_decoder_gpu.h"

namespace dali {
class VideoSampleGpu {
public:
void Decode();

FramesDecoderGpu *video_file_ = nullptr;
VideoSampleDesc *span_ = nullptr;
int sequence_len_ = 0;
Tensor<GPUBackend> data_;
int label_ = -1;
};


class VideoLoaderDecoderGpu : public Loader<GPUBackend, VideoSampleGpu> {
public:
explicit inline VideoLoaderDecoderGpu(const OpSpec &spec) :
Loader<GPUBackend, VideoSampleGpu>(spec),
filenames_(spec.GetRepeatedArgument<std::string>("filenames")),
sequence_len_(spec.GetArgument<int>("sequence_length")),
stride_(spec.GetArgument<int>("stride")),
step_(spec.GetArgument<int>("step")),
cuda_stream_(GetCudaStream()) {
if (step_ <= 0) {
step_ = stride_ * sequence_len_;
}
has_labels_ = spec.TryGetRepeatedArgument(labels_, "labels");
}

void ReadSample(VideoSampleGpu &sample) override;

void PrepareEmpty(VideoSampleGpu &sample) override;

~VideoLoaderDecoderGpu();

protected:
Index SizeImpl() override;

void PrepareMetadataImpl() override;

private:
void Reset(bool wrap_to_shard) override;

cudaStream_t GetCudaStream();

std::vector<std::string> filenames_;
std::vector<int> labels_;
bool has_labels_ = false;
std::vector<FramesDecoderGpu> video_files_;
std::vector<VideoSampleDesc> sample_spans_;

Index current_index_ = 0;

int sequence_len_;
int stride_;
int step_;

cudaStream_t cuda_stream_;
};

} // namespace dali

#endif // DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
Loading