Add VideoReaderDecoder GPU #3668
File: dali/operators/reader/loader/video/frames_decoder.cc

@@ -14,6 +14,7 @@
#include "dali/operators/reader/loader/video/frames_decoder.h"
#include <memory>
#include <iomanip>
#include "dali/core/error_handling.h"
@@ -173,13 +174,14 @@ bool FramesDecoder::ReadRegularFrame(uint8_t *data, bool copy_to_output) {
      break;
    }

    LOG_LINE << "Read frame (ReadRegularFrame), index " << next_frame_idx_ << ", timestamp " <<
      std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;

Review comment on lines +177 to +178: I improved logging in …

    if (!copy_to_output) {
      ++next_frame_idx_;
      return true;
    }

    CopyToOutput(data);
    LOG_LINE << "Read frame (ReadRegularFrame), timestamp " << av_state_->frame_->pts << std::endl;
    ++next_frame_idx_;
    return true;
  }

@@ -257,10 +259,13 @@ bool FramesDecoder::ReadFlushFrame(uint8_t *data, bool copy_to_output) {

  if (copy_to_output) {
    CopyToOutput(data);
    LOG_LINE << "Read frame (ReadFlushFrame), timestamp " << av_state_->frame_->pts << std::endl;
  }

  LOG_LINE << "Read frame (ReadFlushFrame), index " << next_frame_idx_ << " timestamp " <<
    std::setw(5) << av_state_->frame_->pts << ", current copy " << copy_to_output << std::endl;

Review comment on lines +264 to +265: I improved logging in …

  ++next_frame_idx_;

  // TODO(awolant): Figure out how to handle this during index building

Review discussion on the TODO above:
- Something I discovered during this task. Will be handled in the future, as this is a minor inconvenience.
- So how is this handled now?
- It was always handled through checking the return value of the decoding function.
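For context, a minimal caller-side sketch of what "checking the return value of the decoding function" means; this is not code from the PR, `decoder` and `frame_buffer` are hypothetical names, and `ReadNextFrame` follows the signature shown later in this diff:

```cpp
// Hypothetical caller-side sketch: ReadNextFrame() returns false once there are
// no more frames, so the end of the stream is detected without a dedicated flag.
int decoded = 0;
while (decoder.ReadNextFrame(frame_buffer.data(), /* copy_to_output = */ false)) {
  ++decoded;  // frame was decoded; no copy requested, only the count matters here
}
// `decoded` now holds the number of frames that could be read
```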

  if (next_frame_idx_ >= NumFrames()) {
    next_frame_idx_ = -1;
  }
File: dali/operators/reader/loader/video/frames_decoder_gpu.cc

@@ -19,6 +19,7 @@
#include <string>
#include <memory>
#include <iomanip>

#include "dali/core/error_handling.h"
#include "dali/core/cuda_utils.h"
@@ -125,6 +126,10 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
  if (current_pts == NextFramePts()) {
    // Currently decoded frame is actually the one we wanted
    frame_returned_ = true;

Review discussion:
- I think you need a similar WAR in L112 as in https://github.com/NVIDIA/DALI/blob/main/dali/operators/reader/nvdecoder/nvdecoder.cc#L62.
- Done

    LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
      std::setw(5) << current_pts << ", current copy " << current_copy_to_output_ << std::endl;

Review comment on lines +130 to +131: I improved logging in …

    if (current_copy_to_output_ == false) {
      return 1;
    }

@@ -155,12 +160,17 @@ int FramesDecoderGpu::ProcessPictureDecode(void *user_data, CUVIDPICPARAMS *pict
    Width(),
    Height(),
    stream_);
  // TODO(awolant): Alternative is to copy the data to a buffer
  // and then process it on the stream. Check if this is faster when
  // the benchmark is ready.
  CUDA_CALL(cudaStreamSynchronize(stream_));

Review comment: Even though we pass …
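A hedged sketch of the alternative mentioned in the TODO above, not what the PR implements: stage the mapped NV12 surface into a decoder-owned device buffer asynchronously, so the frame can be unmapped right away and converted later on the same stream. `StageDecodedSurface` and its parameters are hypothetical names.

```cpp
#include <cuda_runtime.h>

// Hypothetical helper: copy a mapped NV12 surface (luma plane followed by
// interleaved chroma) into a tightly packed device staging buffer on `stream`.
cudaError_t StageDecodedSurface(uint8_t *staging, const uint8_t *mapped_frame,
                                size_t pitch, int width, int height,
                                cudaStream_t stream) {
  // NV12: `height` rows of luma plus `height / 2` rows of chroma, `width` bytes each.
  size_t rows = static_cast<size_t>(height) + height / 2;
  return cudaMemcpy2DAsync(staging, width, mapped_frame, pitch,
                           width, rows, cudaMemcpyDeviceToDevice, stream);
}
```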

  CUDA_CALL(cuvidUnmapVideoFrame(nvdecode_state_->decoder, frame));

  return 1;
}

void FramesDecoderGpu::SeekFrame(int frame_id) {
  // TODO(awolant): This seek can be optimized - for consecutive frames not needed etc.
  SendLastPacket(true);
  FramesDecoder::SeekFrame(frame_id);
}
@@ -177,6 +187,9 @@ bool FramesDecoderGpu::ReadNextFrame(uint8_t *data, bool copy_to_output) {
  if (copy_to_output) {
    copyD2D(data, frame.frame_.data(), FrameSize());
  }
  LOG_LINE << "Read frame, index " << next_frame_idx_ << ", timestamp " <<
    std::setw(5) << frame.pts_ << ", current copy " << copy_to_output << std::endl;

  frame.pts_ = -1;

  ++next_frame_idx_;
File: dali/operators/reader/loader/video/video_loader_decoder_gpu.cc (new file)

@@ -0,0 +1,124 @@
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/operators/reader/loader/video/video_loader_decoder_gpu.h"

#include "dali/util/nvml.h"

namespace dali {

void VideoSampleGpu::Decode() {
  TensorShape<4> shape = {
    sequence_len_,
    video_file_->Height(),
    video_file_->Width(),
    video_file_->Channels()};

  data_.Resize(
    shape,
    DALIDataType::DALI_UINT8);

  for (int i = 0; i < sequence_len_; ++i) {
    int frame_id = span_->start_ + i * span_->stride_;
    video_file_->SeekFrame(frame_id);
    video_file_->ReadNextFrame(
      static_cast<uint8_t *>(data_.raw_mutable_data()) + i * video_file_->FrameSize());
  }
}

VideoLoaderDecoderGpu::~VideoLoaderDecoderGpu() {
  CUDA_DTOR_CALL(cudaStreamDestroy(cuda_stream_));
}

cudaStream_t VideoLoaderDecoderGpu::GetCudaStream() {
#if NVML_ENABLED
  {
    nvml::Init();
    static float driver_version = nvml::GetDriverVersion();
    if (driver_version > 460 && driver_version < 470.21) {
      DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
      return 0;
    }
  }
#else
  {
    int driver_cuda_version = 0;
    CUDA_CALL(cuDriverGetVersion(&driver_cuda_version));
    if (driver_cuda_version >= 11030 && driver_cuda_version < 11040) {
      DALI_WARN_ONCE("Warning: Decoding on a default stream. Performance may be affected.");
      return 0;
    }
  }
#endif

  // TODO(awolant): Check per decoder stream
  cudaStream_t stream;
  DeviceGuard dg(device_id_);
  CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

Review discussion:
- Consider using …
- We can't use …
  return stream;
}

void VideoLoaderDecoderGpu::PrepareEmpty(VideoSampleGpu &sample) {
  sample = {};
}

void VideoLoaderDecoderGpu::ReadSample(VideoSampleGpu &sample) {
  auto &sample_span = sample_spans_[current_index_];

  // Bind sample to the video and span, so it can be decoded later
  sample.span_ = &sample_span;
  sample.video_file_ = &video_files_[sample_span.video_idx_];
  sample.sequence_len_ = sequence_len_;

  if (has_labels_) {
    sample.label_ = labels_[sample_span.video_idx_];
  }

Review discussion on lines +83 to +85:
- I know it's not possible with the current code, but what if this "sample" had a label in a previous iteration? We wouldn't be clearing it. How about making no-label the default value?
- I added …
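A minimal sketch of what the suggested fix could look like; this is an assumption based on the exchange above, not necessarily the code that was added:

```cpp
// Hypothetical version of the binding above: -1 matches the no-label default
// of VideoSampleGpu::label_ declared in the header file later in this diff,
// so a recycled sample cannot keep a stale label from a previous iteration.
sample.label_ = has_labels_ ? labels_[sample_span.video_idx_] : -1;
```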

  ++current_index_;
  MoveToNextShard(current_index_);
}

Index VideoLoaderDecoderGpu::SizeImpl() {
  return sample_spans_.size();
}

void VideoLoaderDecoderGpu::PrepareMetadataImpl() {
  video_files_.reserve(filenames_.size());

Review discussion:
- Ok. So we have as many FramesDecoderGpu instances as input files (including the decoder instances inside).
- Solving this properly is part of DALI-2321, to be done when we have the benchmark (DALI-2594). Before that, it is hard to tell anything about the performance impact of any possible solution.
- I think it is not about the perf, rather about resource constraints. I think creating 1000 decoders and parsers will consume a lot of resources.
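To illustrate the resource concern raised here, a purely hypothetical sketch, not part of the PR or of DALI: a bounded cache could keep only a few FramesDecoderGpu instances open at a time, at the cost of re-opening files; the decoder constructor signature is assumed from the emplace_back call below.

```cpp
#include <deque>
#include <memory>
#include <string>
#include <utility>

// Hypothetical bounded cache of decoders, keyed by filename.
class DecoderCache {
 public:
  explicit DecoderCache(size_t max_open) : max_open_(max_open) {}

  // Returns a decoder for `filename`, creating one and evicting the oldest
  // entry if the cache is full.
  FramesDecoderGpu &Get(const std::string &filename, cudaStream_t stream) {
    for (auto &entry : open_) {
      if (entry.first == filename)
        return *entry.second;
    }
    if (open_.size() == max_open_)
      open_.pop_front();
    open_.emplace_back(filename, std::make_unique<FramesDecoderGpu>(filename, stream));
    return *open_.back().second;
  }

 private:
  size_t max_open_;
  std::deque<std::pair<std::string, std::unique_ptr<FramesDecoderGpu>>> open_;
};
```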

  for (auto &filename : filenames_) {
    video_files_.emplace_back(filename, cuda_stream_);
  }

  for (size_t video_idx = 0; video_idx < video_files_.size(); ++video_idx) {
    for (int start = 0;
         start + stride_ * sequence_len_ <= video_files_[video_idx].NumFrames();
         start += step_) {
      sample_spans_.push_back(
        VideoSampleDesc(start, start + stride_ * sequence_len_, stride_, video_idx));
    }
  }
  if (shuffle_) {
    // seeded with hardcoded value to get
    // the same sequence on every shard
    std::mt19937 g(kDaliDataloaderSeed);
    std::shuffle(std::begin(sample_spans_), std::end(sample_spans_), g);
  }

  // set the initial index for each shard
  Reset(true);
}

void VideoLoaderDecoderGpu::Reset(bool wrap_to_shard) {
  current_index_ = wrap_to_shard ? start_index(shard_id_, num_shards_, SizeImpl()) : 0;
}

}  // namespace dali
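Putting the loader and sample types above together, usage is roughly the following. This is a sketch under assumptions: `spec` stands for an OpSpec carrying the reader arguments, and the reader operator that actually drives the loader (and triggers metadata preparation) is not part of this diff.

```cpp
// Hypothetical driver code for the loader defined above; in DALI this sequence
// is normally performed by the reader operator rather than written by hand.
VideoLoaderDecoderGpu loader(spec);
VideoSampleGpu sample;
loader.PrepareEmpty(sample);   // reset the sample to a clean state
loader.ReadSample(sample);     // bind decoder, span, sequence length and label
sample.Decode();               // fill sample.data_ with sequence_len_ frames on the GPU
```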
File: dali/operators/reader/loader/video/video_loader_decoder_gpu.h (new file)

@@ -0,0 +1,86 @@
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_
#define DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_

#include <string>
#include <vector>

#include "dali/operators/reader/loader/loader.h"
#include "dali/operators/reader/loader/video/video_loader_decoder_cpu.h"
#include "dali/operators/reader/loader/video/frames_decoder_gpu.h"

namespace dali {

class VideoSampleGpu {
 public:
  void Decode();

  FramesDecoderGpu *video_file_ = nullptr;
  VideoSampleDesc *span_ = nullptr;
  int sequence_len_ = 0;
  Tensor<GPUBackend> data_;
  int label_ = -1;
};


class VideoLoaderDecoderGpu : public Loader<GPUBackend, VideoSampleGpu> {
 public:
  explicit inline VideoLoaderDecoderGpu(const OpSpec &spec) :
    Loader<GPUBackend, VideoSampleGpu>(spec),
    filenames_(spec.GetRepeatedArgument<std::string>("filenames")),
    sequence_len_(spec.GetArgument<int>("sequence_length")),
    stride_(spec.GetArgument<int>("stride")),
    step_(spec.GetArgument<int>("step")),
    cuda_stream_(GetCudaStream()) {
    if (step_ <= 0) {
      step_ = stride_ * sequence_len_;
    }
    has_labels_ = spec.TryGetRepeatedArgument(labels_, "labels");
  }

  void ReadSample(VideoSampleGpu &sample) override;

  void PrepareEmpty(VideoSampleGpu &sample) override;

  ~VideoLoaderDecoderGpu();

 protected:
  Index SizeImpl() override;

  void PrepareMetadataImpl() override;

 private:
  void Reset(bool wrap_to_shard) override;

  cudaStream_t GetCudaStream();

  std::vector<std::string> filenames_;
  std::vector<int> labels_;
  bool has_labels_ = false;
  std::vector<FramesDecoderGpu> video_files_;
  std::vector<VideoSampleDesc> sample_spans_;

  Index current_index_ = 0;

  int sequence_len_;
  int stride_;
  int step_;

  cudaStream_t cuda_stream_;
};

}  // namespace dali

#endif  // DALI_OPERATORS_READER_LOADER_VIDEO_VIDEO_LOADER_DECODER_GPU_H_

Review discussion:
- This fixes a build error that I discovered while implementing this task.
- 👍