Improve performance of experimental.resize #5662

Open · wants to merge 3 commits into main
148 changes: 68 additions & 80 deletions dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
@@ -23,6 +23,7 @@
#include "dali/kernels/imgproc/resample/params.h"
#include "dali/operators/image/resize/resize_op_impl.h"
#include "dali/operators/nvcvop/nvcvop.h"
#include "dali/core/nvtx.h"
Review comment (Contributor): Unused?


namespace dali {

@@ -33,12 +34,13 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {

static_assert(spatial_ndim == 2 || spatial_ndim == 3, "Only 2D and 3D resizing is supported");


/// Dimensionality of each separate frame. If input contains no channel dimension, one is added
static constexpr int frame_ndim = spatial_ndim + 1;

void Setup(TensorListShape<> &out_shape, const TensorListShape<> &in_shape, int first_spatial_dim,
span<const kernels::ResamplingParams> params) override {
first_spatial_dim_ = first_spatial_dim;

// Calculate output shape of the input, as supplied (sequences, planar images, etc)
GetResizedShape(out_shape, in_shape, params, spatial_ndim, first_spatial_dim);

@@ -49,37 +51,46 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
// effective frames (from videos, channel planes, etc).
GetResizedShape(out_shape_, in_shape_, make_cspan(params_), 0);

// Create a map of non-empty samples
SetFrameIdxs();

// Now that we know how many logical frames there are, calculate batch subdivision.
CalculateMinibatchPartition(minibatch_size_);

CalculateSourceSamples(in_shape, first_spatial_dim);

SetupKernel();
}

// Set the frame_idx_ map with indices of samples that are not empty
void SetFrameIdxs() {
frame_idx_.clear();
frame_idx_.reserve(in_shape_.num_samples());
for (int i = 0; i < in_shape_.num_samples(); ++i) {
if (volume(out_shape_.tensor_shape_span(i)) != 0 &&
volume(in_shape_.tensor_shape_span(i)) != 0) {
frame_idx_.push_back(i);
// Assign each minibatch a range of frames in the original input/output TensorLists
void CalculateSourceSamples(const TensorListShape<> &original_shape, int first_spatial_dim) {
int64_t sample_id = 0;
int64_t frame_offset = 0;
for (auto &mb : minibatches_) {
auto v = original_shape[sample_id].num_elements();
while (v == 0) {
sample_id++;
v = original_shape[sample_id].num_elements();
}
mb.sample_offset = sample_id;
mb.frame_offset = frame_offset;
frame_offset = mb.frame_offset + mb.count;
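// Advance past the frames consumed by this minibatch; if that runs past the end of the
// current sample, move to the following sample(s), carrying over the remaining offset.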
int frames_n = num_frames(original_shape[sample_id], first_spatial_dim);
while (frame_offset >= frames_n) {
frame_offset -= frames_n;
if (++sample_id >= original_shape.num_samples()) {
break;
}
frames_n = num_frames(original_shape[sample_id], first_spatial_dim);
}
total_frames_ = frame_idx_.size();
}
}

// get the index of a frame in the DALI TensorList
int frame_idx(int f) {
return frame_idx_[f];
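// Number of frames in a sample: the volume of the dimensions preceding the first spatial one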
int64_t num_frames(const TensorShape<> &shape, int first_spatial_dim) {
return volume(&shape[0], &shape[first_spatial_dim]);
}

void SetupKernel() {
kernels::KernelContext ctx;
rois_.resize(total_frames_);
workspace_reqs_ = {};
workspace_reqs_[0] = {};
workspace_reqs_[1] = {};
std::vector<HQResizeTensorShapeI> mb_input_shapes(minibatch_size_);
std::vector<HQResizeTensorShapeI> mb_output_shapes(minibatch_size_);
auto *rois_ptr = rois_.data();
@@ -88,30 +99,29 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {

int end = mb.start + mb.count;
for (int i = mb.start, j = 0; i < end; i++, j++) {
auto f_id = frame_idx(i);
rois_ptr[j] = GetRoi(params_[f_id]);
rois_ptr[j] = GetRoi(params_[i]);
for (int d = 0; d < spatial_ndim; ++d) {
mb_input_shapes[j].extent[d] = static_cast<int32_t>(in_shape_.tensor_shape_span(f_id)[d]);
mb_input_shapes[j].extent[d] = static_cast<int32_t>(in_shape_.tensor_shape_span(i)[d]);
mb_output_shapes[j].extent[d] =
static_cast<int32_t>(out_shape_.tensor_shape_span(f_id)[d]);
static_cast<int32_t>(out_shape_.tensor_shape_span(i)[d]);
}
}
int num_channels = in_shape_[frame_idx(0)][frame_ndim - 1];
int num_channels = in_shape_[0][frame_ndim - 1];
HQResizeTensorShapesI mb_input_shape{mb_input_shapes.data(), mb.count, spatial_ndim,
num_channels};
HQResizeTensorShapesI mb_output_shape{mb_output_shapes.data(), mb.count, spatial_ndim,
num_channels};
mb.rois = HQResizeRoisF{mb.count, spatial_ndim, rois_ptr};
rois_ptr += mb.count;

auto param = params_[frame_idx(mb.start)][0];
auto param = params_[mb.start][0];
mb.min_interpolation = GetInterpolationType(param.min_filter);
mb.mag_interpolation = GetInterpolationType(param.mag_filter);
mb.antialias = param.min_filter.antialias || param.mag_filter.antialias;
auto ws_req = resize_op_.getWorkspaceRequirements(mb.count, mb_input_shape, mb_output_shape,
mb.min_interpolation, mb.mag_interpolation,
mb.antialias, mb.rois);
workspace_reqs_ = nvcvop::MaxWorkspaceRequirements(workspace_reqs_, ws_req);
workspace_reqs_[mb_idx % 2] = cvcuda::MaxWorkspaceReq(workspace_reqs_[mb_idx % 2], ws_req);
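// Consecutive minibatches alternate between two workspace-requirement slots (and, in
// RunResize, two workspaces), presumably so that one minibatch can be staged while the
// previous one still occupies its workspace.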
}
}

@@ -146,36 +156,43 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {

void RunResize(Workspace &ws, TensorList<GPUBackend> &output,
const TensorList<GPUBackend> &input) override {
TensorList<GPUBackend> in_frames;
in_frames.ShareData(input);
in_frames.Resize(in_shape_);
PrepareInput(in_frames);

TensorList<GPUBackend> out_frames;
out_frames.ShareData(output);
out_frames.Resize(out_shape_);
PrepareOutput(out_frames);


kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
auto allocator = nvcvop::GetScratchpadAllocator(scratchpad);

auto workspace_mem = op_workspace_.Allocate(workspace_reqs_, scratchpad);
auto workspace_mem = AllocateWorkspaces(scratchpad);

for (size_t b = 0; b < minibatches_.size(); b++) {
MiniBatch &mb = minibatches_[b];
resize_op_(ws.stream(), workspace_mem, mb.input, mb.output, mb.min_interpolation,
auto reqs = nvcv::TensorBatch::CalcRequirements(mb.count);
auto mb_output = nvcv::TensorBatch(reqs, allocator);
auto mb_input = nvcv::TensorBatch(reqs, allocator);
nvcvop::PushFramesToBatch(mb_input, input, first_spatial_dim_, mb.sample_offset,
mb.frame_offset, mb.count, sample_layout_);
nvcvop::PushFramesToBatch(mb_output, output, first_spatial_dim_, mb.sample_offset,
mb.frame_offset, mb.count, sample_layout_);
Review comment on lines +169 to +172 (@mzient, Contributor, Oct 30, 2024):
This is a bug. Both inputs and outputs should be inserted in one go and skipping the empty samples should be based solely on the output size. The user may request resizing a non-empty tensor to (0, 0), which is not an error AFAIR. Resizing an empty input to non-empty shape is an error and should be thrown at some point.
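A minimal sketch of the fix described above, with the skip decision made per frame from the output shape alone (the loop and names here are illustrative, not part of this PR):

for (int i = mb.start; i < mb.start + mb.count; ++i) {
  bool out_empty = volume(out_shape_.tensor_shape_span(i)) == 0;
  bool in_empty = volume(in_shape_.tensor_shape_span(i)) == 0;
  // Resizing a non-empty frame to an empty shape is allowed and simply skipped;
  // resizing an empty frame to a non-empty shape is an error.
  DALI_ENFORCE(out_empty || !in_empty,
               "Cannot resize an empty input frame to a non-empty output shape");
  if (out_empty)
    continue;  // skip the frame in both the input and the output batch
  // ...push the corresponding input and output tensors here, in one go...
}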

resize_op_(ws.stream(), workspace_mem[b % 2], mb_input, mb_output, mb.min_interpolation,
mb.mag_interpolation, mb.antialias, mb.rois);
}
}

std::array<cvcuda::Workspace, 2> AllocateWorkspaces(kernels::Scratchpad &scratchpad) {
std::array<cvcuda::Workspace, 2> result;
result[0] = op_workspace_.Allocate(workspace_reqs_[0], scratchpad);
if (minibatches_.size() > 1) {
result[1] = op_workspace_.Allocate(workspace_reqs_[1], scratchpad);
}
return result;
}

void CalculateMinibatchPartition(int minibatch_size) {
total_frames_ = in_shape_.num_samples();
std::vector<std::pair<int, int>> continuous_ranges;
kernels::FilterDesc min_filter_desc = params_[frame_idx(0)][0].min_filter;
kernels::FilterDesc mag_filter_desc = params_[frame_idx(0)][0].mag_filter;
kernels::FilterDesc min_filter_desc = params_[0][0].min_filter;
kernels::FilterDesc mag_filter_desc = params_[0][0].mag_filter;
int start_id = 0;
for (int i = 0; i < total_frames_; i++) {
if (params_[frame_idx(i)][0].min_filter != min_filter_desc ||
params_[frame_idx(i)][0].mag_filter != mag_filter_desc) {
if (params_[i][0].min_filter != min_filter_desc ||
params_[i][0].mag_filter != mag_filter_desc) {
// we break the range if different filter types are used
continuous_ranges.emplace_back(start_id, i);
start_id = i;
@@ -204,60 +221,31 @@
}

TensorListShape<frame_ndim> in_shape_, out_shape_;
std::vector<int> frame_idx_; // map of absolute frame indices in the input TensorList
int total_frames_ = 0; // number of non-empty frames
int total_frames_; // number of non-empty frames

std::vector<ResamplingParamsND<spatial_ndim>> params_;
int first_spatial_dim_;

cvcuda::HQResize resize_op_{};
nvcvop::NVCVOpWorkspace op_workspace_;
cvcuda::WorkspaceRequirements workspace_reqs_{};
std::array<cvcuda::WorkspaceRequirements, 2> workspace_reqs_{};
std::vector<HQResizeRoiF> rois_;
const TensorLayout sample_layout_ = (spatial_ndim == 2) ? "HWC" : "DHWC";

std::vector<const void*> in_frames_;
std::vector<const void*> out_frames_;

struct MiniBatch {
int start, count;
nvcv::TensorBatch input;
nvcv::TensorBatch output;
NVCVInterpolationType min_interpolation;
NVCVInterpolationType mag_interpolation;
bool antialias;
HQResizeRoisF rois;
int64_t sample_offset; // id of a starting sample in the original IOs
int64_t frame_offset; // id of a starting frame in the starting sample
};

std::vector<MiniBatch> minibatches_;

void PrepareInput(const TensorList<GPUBackend> &input) {
for (auto &mb : minibatches_) {
int curr_capacity = mb.input ? mb.input.capacity() : 0;
if (mb.count > curr_capacity) {
int new_capacity = std::max(mb.count, curr_capacity * 2);
auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity);
mb.input = nvcv::TensorBatch(reqs);
} else {
mb.input.clear();
}
for (int i = mb.start; i < mb.start + mb.count; ++i) {
mb.input.pushBack(nvcvop::AsTensor(input[frame_idx(i)], sample_layout_));
}
}
}

void PrepareOutput(const TensorList<GPUBackend> &out) {
for (auto &mb : minibatches_) {
int curr_capacity = mb.output ? mb.output.capacity() : 0;
if (mb.count > curr_capacity) {
int new_capacity = std::max(mb.count, curr_capacity * 2);
auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity);
mb.output = nvcv::TensorBatch(reqs);
} else {
mb.output.clear();
}
for (int i = mb.start; i < mb.start + mb.count; ++i) {
mb.output.pushBack(nvcvop::AsTensor(out[frame_idx(i)], sample_layout_));
}
}
}

int minibatch_size_;
};

8 changes: 5 additions & 3 deletions dali/operators/image/resize/resize_op_impl.h
@@ -63,7 +63,8 @@ void GetFrameShapesAndParams(

for (int i = 0; i < N; i++) {
auto in_sample_shape = in_shape.tensor_shape_span(i);
total_frames += volume(&in_sample_shape[0], &in_sample_shape[first_spatial_dim]);
if (volume(in_sample_shape) > 0)
total_frames += volume(&in_sample_shape[0], &in_sample_shape[first_spatial_dim]);
Review comment on lines +66 to +67 (Contributor):
This is a bug - the emptiness of a frame depends on the output shape, not input. At least in old DALI resize, you can resize a non-empty frame to size 0. I understand that such samples should be skipped (both at input and output).
Resizing an empty frame to a non-zero shape is impossible and should throw.
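A minimal sketch of that rule, assuming the helper also receives the requested output shape (the out_shape argument below is hypothetical, not part of this PR):

auto out_sample_shape = out_shape.tensor_shape_span(i);  // hypothetical output-shape argument
if (volume(out_sample_shape) == 0)
  continue;  // frames resized to an empty shape are skipped at both input and output
DALI_ENFORCE(volume(in_sample_shape) > 0,
             "Cannot resize an empty frame to a non-empty shape");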

}

frame_params.resize(total_frames);
@@ -72,10 +73,11 @@
int ndim = in_shape.sample_dim();
for (int i = 0, flat_frame_idx = 0; i < N; i++) {
auto in_sample_shape = in_shape.tensor_shape_span(i);
if (volume(in_sample_shape) == 0) {
continue; // skip empty samples
}
// Collapse leading dimensions, if any, as frame dim. This handles channel-first.
int seq_len = volume(&in_sample_shape[0], &in_sample_shape[first_spatial_dim]);
if (seq_len == 0)
continue; // skip empty sequences
TensorShape<out_ndim> frame_shape;
frame_shape.resize(frame_ndim);

86 changes: 79 additions & 7 deletions dali/operators/nvcvop/nvcvop.cc
@@ -14,8 +14,8 @@

#include "dali/operators/nvcvop/nvcvop.h"


#include <string>
#include <utility>

namespace dali::nvcvop {

@@ -208,11 +208,11 @@ nvcv::Tensor AsTensor(ConstSampleView<GPUBackend> sample, TensorLayout layout,
return AsTensor(const_cast<void *>(sample.raw_data()), shape, sample.type(), layout);
}

nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDType,
nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType daliDType,
TensorLayout layout) {
auto dtype = GetDataType(daliDType, 1);
nvcv::TensorDataStridedCuda::Buffer inBuf;
inBuf.basePtr = reinterpret_cast<NVCVByte *>(const_cast<void *>(data));
inBuf.basePtr = static_cast<NVCVByte *>(const_cast<void *>(data));
inBuf.strides[shape.size() - 1] = dtype.strideBytes();
for (int d = shape.size() - 2; d >= 0; --d) {
inBuf.strides[d] = shape[d + 1] * inBuf.strides[d + 1];
@@ -225,13 +225,68 @@ nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDT
return nvcv::TensorWrapData(inData);
}

void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
TensorLayout layout) {
for (int s = 0; s < t_list.num_samples(); ++s) {
batch.pushBack(AsTensor(t_list[s], layout));
nvcv::Tensor AsTensor(const void *data, span<const int64_t> shape_data, const nvcv::DataType &dtype,
const nvcv::TensorLayout &layout) {
int ndim = shape_data.size();
nvcv::TensorDataStridedCuda::Buffer inBuf;
inBuf.basePtr = static_cast<NVCVByte *>(const_cast<void *>(data));
inBuf.strides[ndim - 1] = dtype.strideBytes();
for (int d = ndim - 2; d >= 0; --d) {
inBuf.strides[d] = shape_data[d + 1] * inBuf.strides[d + 1];
}
nvcv::TensorShape out_shape(shape_data.data(), ndim, layout);
nvcv::TensorDataStridedCuda inData(out_shape, dtype, inBuf);
return nvcv::TensorWrapData(inData);
}

int64_t calc_num_frames(const TensorShape<> &shape, int first_spatial_dim) {
return (first_spatial_dim > 0) ?
volume(&shape[0], &shape[first_spatial_dim]) :
1;
}

void PushFramesToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
int first_spatial_dim, int64_t starting_sample, int64_t frame_offset,
int64_t num_frames, const TensorLayout &layout) {
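// Wrap num_frames consecutive frames of t_list as NVCV tensors and append them to batch,
// starting at frame frame_offset within sample starting_sample; samples whose frames are
// empty are skipped along the way.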
int ndim = layout.ndim();
auto nvcv_layout = nvcv::TensorLayout(layout.c_str());
auto dtype = GetDataType(t_list.type());

std::vector<nvcv::Tensor> tensors;
tensors.reserve(num_frames);

const auto &input_shape = t_list.shape();
int64_t sample_id = starting_sample - 1;
auto type_size = dtype.strideBytes();
std::vector<int64_t> frame_shape(ndim, 1);

auto frame_stride = 0;
int sample_nframes = 0;
const uint8_t *data = nullptr;

for (int64_t i = 0; i < num_frames; ++i) {
if (frame_offset == sample_nframes) {
frame_offset = 0;
do {
++sample_id;
auto sample_shape = input_shape[sample_id];
DALI_ENFORCE(sample_id < t_list.num_samples());
Review comment (Contributor):
I don't think that either user or faulty data could trigger this - it would be an internal error, so I'd recommend using an assert or throwing logic_error at worst.
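A sketch of the suggested alternative:

assert(sample_id < t_list.num_samples());  // internal invariant; not reachable from user input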

std::copy(&sample_shape[first_spatial_dim], &sample_shape[input_shape.sample_dim()],
frame_shape.begin());
frame_stride = volume(frame_shape) * type_size;
sample_nframes = calc_num_frames(sample_shape, first_spatial_dim);
} while (sample_nframes * frame_stride == 0); // we skip empty samples
data =
static_cast<const uint8_t *>(t_list.raw_tensor(sample_id)) + frame_stride * frame_offset;
}
tensors.push_back(AsTensor(data, make_span(frame_shape), dtype, nvcv_layout));
data += frame_stride;
frame_offset++;
Review comment on lines +267 to +284 (@mzient, Contributor, Oct 30, 2024):
I think that combining the two loops and changing the outer loop condition to "while there are frames left to insert" makes it more readable:

Suggested change (replacing the loop above with):
int frames_left = num_frames;
while (frames_left) {
if (frame_offset >= sample_nframes) {
++sample_id;
assert(sample_id < t_list.num_samples());
auto sample_shape = input_shape[sample_id];
std::copy(&sample_shape[first_spatial_dim], &sample_shape[input_shape.sample_dim()],
frame_shape.begin());
frame_stride = volume(frame_shape) * type_size;
if (frame_stride == 0) { // this sample is (effectively) empty - skip
sample_nframes = 0;
continue;
}
sample_nframes = calc_num_frames(sample_shape, first_spatial_dim);
data = static_cast<const uint8_t *>(t_list.raw_tensor(sample_id)) + frame_stride * frame_offset;
}
tensors.push_back(AsTensor(data, make_span(frame_shape), dtype, nvcv_layout));
data += frame_stride;
frame_offset++;
frames_left--;
}

}
batch.pushBack(tensors.begin(), tensors.end());
}


cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements &reqs,
kernels::Scratchpad &scratchpad) {
auto *hostBuffer = scratchpad.AllocateHost<uint8_t>(reqs.hostMem.size, reqs.hostMem.alignment);
Expand All @@ -248,4 +303,21 @@ cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements
return workspace_;
}

nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad) {
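// Build an NVCV allocator whose host, pinned and device allocations are serviced by the
// DALI scratchpad; the release callbacks are no-ops because the scratchpad owns the memory
// and reclaims it in bulk.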
auto hostAllocator = nvcv::CustomHostMemAllocator(
[&](int64_t size, int32_t align) { return scratchpad.AllocateHost<uint8_t>(size, align); },
[](void *, int64_t, int32_t) {});

auto pinnedAllocator = nvcv::CustomHostPinnedMemAllocator(
[&](int64_t size, int32_t align) { return scratchpad.AllocatePinned<uint8_t>(size, align); },
[](void *, int64_t, int32_t) {});

auto gpuAllocator = nvcv::CustomCudaMemAllocator(
[&](int64_t size, int32_t align) { return scratchpad.AllocateGPU<uint8_t>(size, align); },
[](void *, int64_t, int32_t) {});

return nvcv::CustomAllocator(std::move(hostAllocator), std::move(pinnedAllocator),
std::move(gpuAllocator));
}

} // namespace dali::nvcvop