
Commit

Improve performance of experimental.resize
Signed-off-by: Rafal Banas <rbanas@nvidia.com>
banasraf committed Oct 7, 2024
1 parent f36f58f commit 0ff79a2
Showing 3 changed files with 90 additions and 70 deletions.
80 changes: 29 additions & 51 deletions dali/operators/image/resize/experimental/resize_op_impl_cvcuda.h
@@ -77,9 +77,9 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
}

void SetupKernel() {
kernels::KernelContext ctx;
rois_.resize(total_frames_);
workspace_reqs_ = {};
workspace_reqs_[0] = {};
workspace_reqs_[1] = {};
std::vector<HQResizeTensorShapeI> mb_input_shapes(minibatch_size_);
std::vector<HQResizeTensorShapeI> mb_output_shapes(minibatch_size_);
auto *rois_ptr = rois_.data();
@@ -111,7 +111,7 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {
auto ws_req = resize_op_.getWorkspaceRequirements(mb.count, mb_input_shape, mb_output_shape,
mb.min_interpolation, mb.mag_interpolation,
mb.antialias, mb.rois);
workspace_reqs_ = nvcvop::MaxWorkspaceRequirements(workspace_reqs_, ws_req);
workspace_reqs_[mb_idx % 2] = cvcuda::MaxWorkspaceReq(workspace_reqs_[mb_idx % 2], ws_req);
}
}
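Workspace requirements are now accumulated into two buckets, one for even-indexed and one for odd-indexed minibatches, so that two independent workspaces can be allocated up front. A condensed sketch of the accumulation, with the getWorkspaceRequirements arguments elided:

    // One requirement bucket per workspace; minibatch 2k folds into bucket 0, 2k+1 into bucket 1.
    std::array<cvcuda::WorkspaceRequirements, 2> workspace_reqs{};
    for (size_t mb_idx = 0; mb_idx < minibatches.size(); mb_idx++) {
      cvcuda::WorkspaceRequirements ws_req =
          resize_op.getWorkspaceRequirements(/* minibatch count, shapes, interpolation, rois ... */);
      workspace_reqs[mb_idx % 2] = cvcuda::MaxWorkspaceReq(workspace_reqs[mb_idx % 2], ws_req);
    }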

@@ -146,28 +146,38 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {

void RunResize(Workspace &ws, TensorList<GPUBackend> &output,
const TensorList<GPUBackend> &input) override {
TensorList<GPUBackend> in_frames;
in_frames.ShareData(input);
in_frames.Resize(in_shape_);
PrepareInput(in_frames);

TensorList<GPUBackend> out_frames;
out_frames.ShareData(output);
out_frames.Resize(out_shape_);
PrepareOutput(out_frames);
kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
auto allocator = nvcvop::GetScratchpadAllocator(scratchpad);

in_frames_.ShareData(input);
in_frames_.Resize(in_shape_);

kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
out_frames_.ShareData(output);
out_frames_.Resize(out_shape_);

auto workspace_mem = op_workspace_.Allocate(workspace_reqs_, scratchpad);
auto workspace_mem = AllocateWorkspaces(scratchpad);

for (size_t b = 0; b < minibatches_.size(); b++) {
MiniBatch &mb = minibatches_[b];
resize_op_(ws.stream(), workspace_mem, mb.input, mb.output, mb.min_interpolation,
auto reqs = nvcv::TensorBatch::CalcRequirements(mb.count);
auto mb_output = nvcv::TensorBatch(reqs, allocator);
auto mb_input = nvcv::TensorBatch(reqs, allocator);
nvcvop::PushTensorsToBatch(mb_input, in_frames_, mb.start, mb.count, sample_layout_);
nvcvop::PushTensorsToBatch(mb_output, out_frames_, mb.start, mb.count, sample_layout_);
resize_op_(ws.stream(), workspace_mem[b % 2], mb_input, mb_output, mb.min_interpolation,
mb.mag_interpolation, mb.antialias, mb.rois);
}
}

std::array<cvcuda::Workspace, 2> AllocateWorkspaces(kernels::Scratchpad &scratchpad) {
std::array<cvcuda::Workspace, 2> result;
result[0] = op_workspace_.Allocate(workspace_reqs_[0], scratchpad);
if (minibatches_.size() > 1) {
result[1] = op_workspace_.Allocate(workspace_reqs_[1], scratchpad);
}
return result;
}

void CalculateMinibatchPartition(int minibatch_size) {
std::vector<std::pair<int, int>> continuous_ranges;
kernels::FilterDesc min_filter_desc = params_[frame_idx(0)][0].min_filter;
@@ -210,54 +220,22 @@ class ResizeOpImplCvCuda : public ResizeBase<GPUBackend>::Impl {

cvcuda::HQResize resize_op_{};
nvcvop::NVCVOpWorkspace op_workspace_;
cvcuda::WorkspaceRequirements workspace_reqs_{};
std::array<cvcuda::WorkspaceRequirements, 2> workspace_reqs_{};
std::vector<HQResizeRoiF> rois_;
const TensorLayout sample_layout_ = (spatial_ndim == 2) ? "HWC" : "DHWC";

TensorList<GPUBackend> in_frames_;
TensorList<GPUBackend> out_frames_;

struct MiniBatch {
int start, count;
nvcv::TensorBatch input;
nvcv::TensorBatch output;
NVCVInterpolationType min_interpolation;
NVCVInterpolationType mag_interpolation;
bool antialias;
HQResizeRoisF rois;
};

std::vector<MiniBatch> minibatches_;

void PrepareInput(const TensorList<GPUBackend> &input) {
for (auto &mb : minibatches_) {
int curr_capacity = mb.input ? mb.input.capacity() : 0;
if (mb.count > curr_capacity) {
int new_capacity = std::max(mb.count, curr_capacity * 2);
auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity);
mb.input = nvcv::TensorBatch(reqs);
} else {
mb.input.clear();
}
for (int i = mb.start; i < mb.start + mb.count; ++i) {
mb.input.pushBack(nvcvop::AsTensor(input[frame_idx(i)], sample_layout_));
}
}
}

void PrepareOutput(const TensorList<GPUBackend> &out) {
for (auto &mb : minibatches_) {
int curr_capacity = mb.output ? mb.output.capacity() : 0;
if (mb.count > curr_capacity) {
int new_capacity = std::max(mb.count, curr_capacity * 2);
auto reqs = nvcv::TensorBatch::CalcRequirements(new_capacity);
mb.output = nvcv::TensorBatch(reqs);
} else {
mb.output.clear();
}
for (int i = mb.start; i < mb.start + mb.count; ++i) {
mb.output.pushBack(nvcvop::AsTensor(out[frame_idx(i)], sample_layout_));
}
}
}

int minibatch_size_;
};
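Taken together, the new RunResize builds the NVCV tensor batches per call from scratchpad memory and alternates between the two preallocated workspaces. A condensed sketch of the per-iteration flow, using the names from the diff above:

    kernels::DynamicScratchpad scratchpad({}, AccessOrder(ws.stream()));
    auto allocator = nvcvop::GetScratchpadAllocator(scratchpad);

    in_frames_.ShareData(input);      // reinterpret the batch as individual frames, no copy
    in_frames_.Resize(in_shape_);
    out_frames_.ShareData(output);
    out_frames_.Resize(out_shape_);

    auto workspace_mem = AllocateWorkspaces(scratchpad);  // two workspaces sized in SetupKernel

    for (size_t b = 0; b < minibatches_.size(); b++) {
      MiniBatch &mb = minibatches_[b];
      auto reqs = nvcv::TensorBatch::CalcRequirements(mb.count);
      nvcv::TensorBatch mb_input(reqs, allocator);    // batch metadata comes from the scratchpad
      nvcv::TensorBatch mb_output(reqs, allocator);
      nvcvop::PushTensorsToBatch(mb_input, in_frames_, mb.start, mb.count, sample_layout_);
      nvcvop::PushTensorsToBatch(mb_output, out_frames_, mb.start, mb.count, sample_layout_);
      resize_op_(ws.stream(), workspace_mem[b % 2], mb_input, mb_output,
                 mb.min_interpolation, mb.mag_interpolation, mb.antialias, mb.rois);
    }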

54 changes: 49 additions & 5 deletions dali/operators/nvcvop/nvcvop.cc
@@ -14,8 +14,8 @@

#include "dali/operators/nvcvop/nvcvop.h"


#include <string>
#include <utility>

namespace dali::nvcvop {

@@ -208,7 +208,7 @@ nvcv::Tensor AsTensor(ConstSampleView<GPUBackend> sample, TensorLayout layout,
return AsTensor(const_cast<void *>(sample.raw_data()), shape, sample.type(), layout);
}

nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDType,
nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType daliDType,
TensorLayout layout) {
auto dtype = GetDataType(daliDType, 1);
nvcv::TensorDataStridedCuda::Buffer inBuf;
@@ -225,11 +225,38 @@ nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType daliDT
return nvcv::TensorWrapData(inData);
}

nvcv::Tensor AsTensor(const void *data, span<const int64_t> shape_data, const nvcv::DataType &dtype,
const nvcv::TensorLayout &layout) {
int ndim = shape_data.size();
nvcv::TensorDataStridedCuda::Buffer inBuf;
inBuf.basePtr = reinterpret_cast<NVCVByte *>(const_cast<void *>(data));
inBuf.strides[ndim - 1] = dtype.strideBytes();
for (int d = ndim - 2; d >= 0; --d) {
inBuf.strides[d] = shape_data[d + 1] * inBuf.strides[d + 1];
}
nvcv::TensorShape out_shape(shape_data.data(), ndim, layout);
nvcv::TensorDataStridedCuda inData(out_shape, dtype, inBuf);
return nvcv::TensorWrapData(inData);
}
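The strides computed above describe a dense, packed layout: the innermost dimension advances by the element size, and every outer stride is the inner extent times the inner stride. A small worked example, assuming a uint8 HWC sample of shape 480 x 640 x 3:

    // d = 2 (C): strides[2] = dtype.strideBytes() = 1 byte
    // d = 1 (W): strides[1] = shape[2] * strides[2] = 3 * 1 = 3 bytes
    // d = 0 (H): strides[0] = shape[1] * strides[1] = 640 * 3 = 1920 bytes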


void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
TensorLayout layout) {
for (int s = 0; s < t_list.num_samples(); ++s) {
batch.pushBack(AsTensor(t_list[s], layout));
int64_t start, int64_t count, const TensorLayout &layout) {
int ndim = t_list.sample_dim();
auto dtype = GetDataType(t_list.type(), 1);
TensorLayout out_layout = layout.empty() ? t_list.GetLayout() : layout;
DALI_ENFORCE(
out_layout.empty() || out_layout.size() == ndim,
make_string("Layout ", out_layout, " does not match the number of dimensions: ", ndim));
auto nvcv_layout = nvcv::TensorLayout(out_layout.c_str());
std::vector<nvcv::Tensor> tensors;
tensors.reserve(count);

for (int s = 0; s < count; ++s) {
tensors.push_back(AsTensor(t_list.raw_tensor(s + start), t_list.tensor_shape_span(s + start),
dtype, nvcv_layout));
}
batch.pushBack(tensors.begin(), tensors.end());
}

cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements &reqs,
@@ -248,4 +275,21 @@ cvcuda::Workspace NVCVOpWorkspace::Allocate(const cvcuda::WorkspaceRequirements
return workspace_;
}

nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad) {
auto hostAllocator = nvcv::CustomHostMemAllocator(
[&](int64_t size, int32_t align) { return scratchpad.AllocateHost<uint8_t>(size, align); },
[](void *, int64_t, int32_t) {});

auto pinnedAllocator = nvcv::CustomHostPinnedMemAllocator(
[&](int64_t size, int32_t align) { return scratchpad.AllocatePinned<uint8_t>(size, align); },
[](void *, int64_t, int32_t) {});

auto gpuAllocator = nvcv::CustomCudaMemAllocator(
[&](int64_t size, int32_t align) { return scratchpad.AllocateGPU<uint8_t>(size, align); },
[](void *, int64_t, int32_t) {});

return nvcv::CustomAllocator(std::move(hostAllocator), std::move(pinnedAllocator),
std::move(gpuAllocator));
}

} // namespace dali::nvcvop
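The release callbacks passed to the custom allocators above are intentionally empty: the scratchpad owns its memory and reclaims it in bulk, so objects created through this allocator only need to stop being used before the scratchpad goes away. A minimal usage sketch, with stream and count as placeholders:

    kernels::DynamicScratchpad scratchpad({}, AccessOrder(stream));
    auto alloc = nvcvop::GetScratchpadAllocator(scratchpad);
    auto reqs = nvcv::TensorBatch::CalcRequirements(count);
    nvcv::TensorBatch batch(reqs, alloc);  // internal buffers are carved out of the scratchpad
    // ... push tensors and run a CV-CUDA operator ...
    // `batch` must not be used after `scratchpad` is destroyed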
26 changes: 12 additions & 14 deletions dali/operators/nvcvop/nvcvop.h
@@ -18,6 +18,7 @@
#include <nvcv/DataType.h>
#include <nvcv/BorderType.h>
#include <cvcuda/Types.h>
#include <nvcv/alloc/Allocator.hpp>
#include <cvcuda/Workspace.hpp>
#include <nvcv/Tensor.hpp>
#include <nvcv/TensorBatch.hpp>
@@ -34,6 +35,7 @@
#include "dali/pipeline/operator/sequence_operator.h"
#include "dali/core/cuda_event_pool.h"


namespace dali::nvcvop {

/**
@@ -112,7 +114,7 @@ nvcv::Tensor AsTensor(SampleView<GPUBackend> sample, TensorLayout layout = "",
nvcv::Tensor AsTensor(ConstSampleView<GPUBackend> sample, TensorLayout layout = "",
const std::optional<TensorShape<>> &reshape = std::nullopt);

nvcv::Tensor AsTensor(void *data, const TensorShape<> shape, DALIDataType dtype,
nvcv::Tensor AsTensor(void *data, const TensorShape<> &shape, DALIDataType dtype,
TensorLayout layout);

/**
@@ -132,9 +134,12 @@ void AllocateImagesLike(nvcv::ImageBatchVarShape &output, const TensorList<GPUBa
*/
void PushImagesToBatch(nvcv::ImageBatchVarShape &batch, const TensorList<GPUBackend> &t_list);


/**
* @brief Push samples from a given tensor list to a given TensorBatch.
* [start, start+count) determines the range of samples in the TensorList that will be used.
*/
void PushTensorsToBatch(nvcv::TensorBatch &batch, const TensorList<GPUBackend> &t_list,
TensorLayout layout);
int64_t start, int64_t count, const TensorLayout &layout);
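For example, pushing the first count frames of an HWC tensor list into an existing batch looks like this (batch, frames, and count are placeholders, set up as in the resize implementation above; an empty layout falls back to the tensor list's own layout):

    nvcvop::PushTensorsToBatch(batch, frames, /*start=*/0, /*count=*/count, "HWC");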

class NVCVOpWorkspace {
public:
Expand Down Expand Up @@ -165,17 +170,10 @@ class NVCVOpWorkspace {
int device_id_{};
};

inline cvcuda::WorkspaceRequirements MaxWorkspaceRequirements(
const cvcuda::WorkspaceRequirements &a, const cvcuda::WorkspaceRequirements &b) {
cvcuda::WorkspaceRequirements max;
max.hostMem.size = std::max(a.hostMem.size, b.hostMem.size);
max.hostMem.alignment = std::max(a.hostMem.alignment, b.hostMem.alignment);
max.pinnedMem.size = std::max(a.pinnedMem.size, b.pinnedMem.size);
max.pinnedMem.alignment = std::max(a.pinnedMem.alignment, b.pinnedMem.alignment);
max.cudaMem.size = std::max(a.cudaMem.size, b.cudaMem.size);
max.cudaMem.alignment = std::max(a.cudaMem.alignment, b.cudaMem.alignment);
return max;
}
/**
* @brief Create an NVCV allocator using the given scratchpad.
*/
nvcv::Allocator GetScratchpadAllocator(kernels::Scratchpad &scratchpad);

/**
* @brief A base class for the CVCUDA operators.
