Optimize TensorList resizing. #5638

Merged (2 commits) on Sep 19, 2024. The diff below shows the changes from all commits.
4 changes: 2 additions & 2 deletions dali/pipeline/data/buffer.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -63,7 +63,7 @@ DLL_PUBLIC shared_ptr<uint8_t> AllocBuffer(size_t bytes, bool pinned,
}

DLL_PUBLIC bool RestrictPinnedMemUsage() {
static bool val = []() {
static const bool val = []() {
const char *env = getenv("DALI_RESTRICT_PINNED_MEM");
return env && atoi(env);
}();
17 changes: 10 additions & 7 deletions dali/pipeline/data/buffer.h
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -302,7 +302,7 @@ class DLL_PUBLIC Buffer {
return !!data_;
}

std::shared_ptr<void> get_data_ptr() const {
const std::shared_ptr<void> &get_data_ptr() const {
return data_;
}

@@ -549,7 +549,7 @@ class DLL_PUBLIC Buffer {
*
* @remark If order is empty, current order is used.
*/
inline void set_backing_allocation(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
inline void set_backing_allocation(shared_ptr<void> ptr, size_t bytes, bool pinned,
DALIDataType type, size_t size, int device_id,
AccessOrder order = {}) {
if (!same_managed_object(data_, ptr))
@@ -562,7 +562,7 @@ class DLL_PUBLIC Buffer {

// Fill the remaining members in the order as they appear in class.
type_ = TypeTable::GetTypeInfo(type);
data_ = ptr;
data_ = std::move(ptr);
allocate_ = {};
size_ = size;
shares_data_ = data_ != nullptr;
@@ -674,7 +674,10 @@ class DLL_PUBLIC Buffer {
static double growth_factor_;
static double shrink_threshold_;

static bool default_pinned();
static bool default_pinned() {
static const bool pinned = !RestrictPinnedMemUsage();
return pinned;
}

TypeInfo type_ = {}; // Data type of underlying storage
shared_ptr<void> data_ = nullptr; // Pointer to underlying storage
@@ -683,8 +686,8 @@
size_t num_bytes_ = 0; // To keep track of the true size of the underlying allocation
int device_ = CPU_ONLY_DEVICE_ID; // device the buffer was allocated on
AccessOrder order_ = AccessOrder::host(); // The order of memory access (host or device)
bool shares_data_ = false; // Whether we aren't using our own allocation
bool pinned_ = !RestrictPinnedMemUsage(); // Whether the allocation uses pinned memory
bool shares_data_ = false; // Whether we aren't using our own allocation
bool pinned_ = default_pinned(); // Whether the allocation uses pinned memory
};

template <typename Backend>
7 changes: 3 additions & 4 deletions dali/pipeline/data/tensor.h
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -44,7 +44,6 @@ class Tensor : public Buffer<Backend> {
inline Tensor() {}
inline ~Tensor() override = default;


/**
*
* @brief For tensor T of shape (s_0, s_1, ..., s_{n-1}) returns a n-1 dimensional tensor T'
@@ -226,7 +225,7 @@ class Tensor : public Buffer<Backend> {
* individually. The device_id describes the location of the memory and the order can describe
* the dependency on the work that is happening on another device.
*/
inline void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
inline void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorShape<> &shape, DALIDataType type, int device_id,
AccessOrder order = {}) {
Index new_size = volume(shape);
@@ -243,7 +242,7 @@

// Save our new pointer and bytes. Reset our type, shape, and size
type_ = TypeTable::GetTypeInfo(type);
data_ = ptr;
data_ = std::move(ptr);
size_ = new_size;
num_bytes_ = bytes;
device_ = device_id;
75 changes: 13 additions & 62 deletions dali/pipeline/data/tensor_list.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -301,7 +301,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const Tensor<Backend> &owner


template <typename Backend>
void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes,
void TensorList<Backend>::SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes,
bool pinned, const TensorShape<> &shape, DALIDataType type,
int device_id, AccessOrder order, const TensorLayout &layout) {
// Bounds check
@@ -316,7 +316,7 @@ void TensorList<Backend>::SetSample(int sample_idx, const shared_ptr<void> &ptr,

// Setting a new share overwrites the previous one - so we can safely assume that even if
// we had a sample sharing into TL, it will be overwritten
tensors_[sample_idx].ShareData(ptr, bytes, pinned, shape, type, device_id, order);
tensors_[sample_idx].ShareData(std::move(ptr), bytes, pinned, shape, type, device_id, order);
// As the order was simply copied over, we have to fix it back.
// We will be accessing it in order of this buffer, so we need to wait for all the work
// from the "incoming" src order.
@@ -460,13 +460,6 @@ std::vector<size_t> TensorList<Backend>::_chunks_capacity() const {
return result;
}


template <typename Backend>
const TensorListShape<> &TensorList<Backend>::shape() const & {
return shape_;
}


template <typename Backend>
void TensorList<Backend>::set_order(AccessOrder order, bool synchronize) {
DALI_ENFORCE(order, "Resetting order to an empty one is not supported");
@@ -529,6 +522,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
if (old_size < new_shape.num_samples()) {
tensors_.resize(new_shape.num_samples());
}

for (int i = old_size; i < new_shape.num_samples(); i++) {
setup_tensor_allocation(i);
}
@@ -575,6 +569,7 @@ void TensorList<Backend>::Resize(const TensorListShape<> &new_shape, DALIDataTyp
for (int i = 0; i < curr_num_tensors_; i++) {
tensors_[i].Resize(new_shape[i], new_type);
}

if (curr_num_tensors_ > 0) {
order_ = tensors_[0].order();
device_ = tensors_[0].device_id();
@@ -629,19 +624,6 @@ void TensorList<Backend>::set_type(DALIDataType new_type_id) {
}
}


template <typename Backend>
DALIDataType TensorList<Backend>::type() const {
return type_.id();
}


template <typename Backend>
const TypeInfo &TensorList<Backend>::type_info() const {
return type_;
}


template <typename Backend>
void TensorList<Backend>::SetLayout(const TensorLayout &layout) {
for (auto &t : tensors_) {
@@ -662,13 +644,6 @@ void TensorList<Backend>::SetSourceInfo(int idx, const std::string &source_info)
tensors_[idx].SetSourceInfo(source_info);
}


template <typename Backend>
TensorLayout TensorList<Backend>::GetLayout() const {
return layout_;
}


template <typename Backend>
const DALIMeta &TensorList<Backend>::GetMeta(int idx) const {
assert(idx < curr_num_tensors_);
@@ -695,13 +670,6 @@ void TensorList<Backend>::set_pinned(bool pinned) {
pinned_ = pinned;
}


template <typename Backend>
bool TensorList<Backend>::is_pinned() const {
return pinned_;
}


template <typename Backend>
void TensorList<Backend>::set_device_id(int device_id) {
contiguous_buffer_.set_device_id(device_id);
@@ -711,13 +679,6 @@ void TensorList<Backend>::set_device_id(int device_id) {
device_ = device_id;
}


template <typename Backend>
int TensorList<Backend>::device_id() const {
return device_;
}


template <typename Backend>
void TensorList<Backend>::reserve(size_t total_bytes) {
int batch_size_bkp = curr_num_tensors_;
@@ -744,30 +705,18 @@ void TensorList<Backend>::reserve(size_t bytes_per_sample, int batch_size) {
}
}


template <typename Backend>
bool TensorList<Backend>::IsContiguous() const noexcept {
return state_.IsContiguous();
}


template <typename Backend>
BatchContiguity TensorList<Backend>::GetContiguity() const noexcept {
return state_.Get();
}


template <typename Backend>
void TensorList<Backend>::recreate_views() {
// precondition: type, shape are configured
uint8_t *sample_ptr = static_cast<uint8_t *>(contiguous_buffer_.raw_mutable_data());
int64_t num_samples = shape().num_samples();
auto &data_ptr = contiguous_buffer_.get_data_ptr();
for (int64_t i = 0; i < num_samples; i++) {
[Review comment by the PR author] Hoisting this line was perhaps the biggest saving here.
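As an illustration of the saving described above: every extra copy of a `shared_ptr` is an atomic reference-count update, so hoisting the owning pointer out of the per-sample loop and building each view with the aliasing constructor costs one increment per sample instead of an extra call and copy every iteration. The following is a minimal standalone sketch, not DALI code; `Pool`, `data_ptr()`, and the 256-byte sample size are invented for the example.

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// Hypothetical stand-in for the contiguous buffer: one allocation shared by
// many per-sample views.
struct Pool {
  std::shared_ptr<std::vector<char>> storage =
      std::make_shared<std::vector<char>>(1024);
  // Returning by const reference (as get_data_ptr() does after this PR)
  // avoids a reference-count bump at every call site.
  const std::shared_ptr<std::vector<char>> &data_ptr() const { return storage; }
};

int main() {
  Pool pool;
  std::vector<std::shared_ptr<void>> views;

  // Hoisted once, outside the loop: no per-iteration call or copy.
  const auto &owner = pool.data_ptr();
  char *cursor = owner->data();
  for (int i = 0; i < 4; i++) {
    // Aliasing constructor: shares ownership with `owner` (one atomic
    // increment) but points at this sample's offset inside the block.
    views.emplace_back(std::shared_ptr<void>(owner, cursor));
    cursor += 256;  // pretend every sample occupies 256 bytes
  }

  // One owner plus four aliased views keep the single allocation alive.
  std::printf("use_count = %ld\n", owner.use_count());
  return 0;
}
```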

// or any other way
auto tensor_size = shape().tensor_size(i);

std::shared_ptr<void> sample_alias(contiguous_buffer_.get_data_ptr(), sample_ptr);
tensors_[i].ShareData(sample_alias, tensor_size * type_info().size(), is_pinned(), shape()[i],
tensors_[i].ShareData(std::shared_ptr<void>(data_ptr, sample_ptr),
tensor_size * type_info().size(), is_pinned(), shape()[i],
[Review comment by the PR author] Having an intermediate variable and moving it was noticeably slower (but still noticeably faster than passing by const-ref and copying). A standalone sketch contrasting the three call styles follows this hunk.

type(), device_id(), order());
tensors_[i].SetLayout(GetLayout());
sample_ptr += tensor_size * type_info().size();
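To make the comparison in the comment above concrete, here is a minimal standalone sketch, not DALI code: `consume` is a hypothetical stand-in for the by-value `ShareData` / `set_backing_allocation` sinks in this PR, and the old const-ref-and-copy pattern is modeled as copying a named alias into that sink.

```cpp
#include <cstdio>
#include <memory>
#include <utility>

// Hypothetical sink taking shared_ptr by value, mirroring the new
// ShareData(shared_ptr<void> ptr, ...) signatures in this PR.
static void consume(std::shared_ptr<void> ptr) {
  std::printf("use_count inside the sink: %ld\n", ptr.use_count());
}

int main() {
  auto owner = std::make_shared<int>(42);
  void *sample_ptr = owner.get();

  // 1) Named alias passed as an lvalue: copied into the parameter. An extra
  //    atomic increment (plus a matching decrement when alias1 dies) on top
  //    of the aliasing construction itself. Slowest of the three.
  std::shared_ptr<void> alias1(owner, sample_ptr);
  consume(alias1);

  // 2) Named alias moved into the parameter: the move itself touches no
  //    reference count, but the extra named object is still constructed
  //    and destroyed.
  std::shared_ptr<void> alias2(owner, sample_ptr);
  consume(std::move(alias2));

  // 3) Alias constructed directly in the call expression: the temporary
  //    becomes the parameter (guaranteed elision since C++17), so there is
  //    exactly one aliasing construction and nothing else.
  consume(std::shared_ptr<void>(owner, sample_ptr));
  return 0;
}
```

This cost profile is presumably also why the PR switches these sinks from `const shared_ptr<void> &` parameters to by-value `shared_ptr<void>` plus an internal `std::move`: callers that can hand over or alias-construct ownership avoid extra atomic reference-count traffic.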
@@ -996,7 +945,8 @@ Tensor<Backend> TensorList<Backend>::AsReshapedTensor(const TensorShape<> &new_s
ptr = nullptr;
}

result.ShareData(ptr, capacity(), is_pinned(), new_shape, type(), device_id(), order());
result.ShareData(std::move(ptr), capacity(), is_pinned(),
new_shape, type(), device_id(), order());

auto result_layout = GetLayout();
if (result_layout.ndim() + 1 == new_shape.sample_dim()) {
@@ -1022,10 +972,11 @@ Tensor<Backend> TensorList<Backend>::AsTensor() {


template <typename Backend>
void TensorList<Backend>::ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
void TensorList<Backend>::ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorListShape<> &shape, DALIDataType type,
int device_id, AccessOrder order, const TensorLayout &layout) {
contiguous_buffer_.set_backing_allocation(ptr, bytes, pinned, type, shape.num_elements(),
contiguous_buffer_.set_backing_allocation(std::move(ptr), bytes, pinned,
type, shape.num_elements(),
device_id, order);
buffer_bkp_.reset();
tensors_.clear();
41 changes: 29 additions & 12 deletions dali/pipeline/data/tensor_list.h
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -143,7 +143,9 @@ class DLL_PUBLIC TensorList {
/**
* @brief Get the shape of the batch.
*/
const TensorListShape<> &shape() const &;
const TensorListShape<> &shape() const & {
return shape_;
}

/**
* @brief Get the shape of the sample.
@@ -273,7 +275,7 @@
* We wait for the order of incoming sample in the order of the batch to allow correctly ordered
* access of the new sample.
*/
DLL_PUBLIC void SetSample(int sample_idx, const shared_ptr<void> &ptr, size_t bytes, bool pinned,
DLL_PUBLIC void SetSample(int sample_idx, shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorShape<> &shape, DALIDataType type, int device_id,
AccessOrder order, const TensorLayout &layout = "");
/** @} */
@@ -325,14 +327,18 @@
/**
* @brief Get the type of samples in the batch.
*/
DALIDataType type() const;
DALIDataType type() const {
return type_.id();
}

/**
* @brief Get the TypeInfo of samples in the batch.
*
* @note Using DALIDataType via type() is recommended over accessing type_info().
*/
const TypeInfo &type_info() const;
const TypeInfo &type_info() const {
return type_;
}
/** @} */

/**
@@ -428,7 +434,10 @@
/**
* @brief If the batch is backed by contiguous buffer
*/
bool IsContiguous() const noexcept;
bool IsContiguous() const noexcept {
return state_.IsContiguous();
}


/**
* @brief Pin the current state for further allocating calls like Resize() or set_type
@@ -440,7 +449,9 @@
/**
* @brief Check the batch contiguity state.
*/
BatchContiguity GetContiguity() const noexcept;
BatchContiguity GetContiguity() const noexcept {
return state_.Get();
}

/**
* @brief Coalesce from individual samples to a contiguous buffer if the conditions are met.
@@ -472,7 +483,7 @@
/**
* @brief Set the provided buffer as backing memory for this batch.
*/
DLL_PUBLIC void ShareData(const shared_ptr<void> &ptr, size_t bytes, bool pinned,
DLL_PUBLIC void ShareData(shared_ptr<void> ptr, size_t bytes, bool pinned,
const TensorListShape<> &shape, DALIDataType type, int device_id,
AccessOrder order = {}, const TensorLayout &layout = "");

@@ -483,11 +494,15 @@

void set_pinned(bool pinned);

bool is_pinned() const;
bool is_pinned() const {
return pinned_;
}

void set_device_id(int device_id);

int device_id() const;
int device_id() const {
return device_;
}

bool has_data() const;

@@ -531,7 +546,9 @@
/**
* @brief Get the layout of the sample in the batch.
*/
TensorLayout GetLayout() const;
TensorLayout GetLayout() const {
return layout_;
}

/**
* @brief Set cache metadata for given sample
@@ -817,7 +834,7 @@ class DLL_PUBLIC TensorList {
* Only allowed for contiguous batch, in typical scenario it is equivalent to
* unsafe_sample_owner(batch, 0)
*/
friend shared_ptr<void> unsafe_owner(TensorList<Backend> &batch) {
friend const shared_ptr<void> &unsafe_owner(TensorList<Backend> &batch) {
DALI_ENFORCE(batch.IsContiguous(),
"Data owner pointer can be obtain only for contiguous TensorList.");
return batch.contiguous_buffer_.get_data_ptr();