Fix GPU buffer handling
Tabrizian committed May 29, 2023
1 parent 3336477 commit 00df006
Showing 11 changed files with 236 additions and 112 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -163,6 +163,8 @@ set(
src/metric.cc
src/metric_family.h
src/metric_family.cc
src/gpu_buffers.cc
src/gpu_buffers.h
)

set(
84 changes: 84 additions & 0 deletions src/gpu_buffers.cc
@@ -0,0 +1,84 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "gpu_buffers.h"
#include "pb_string.h"

namespace triton { namespace backend { namespace python {
GPUBufferTransporter::GPUBufferTransporter()
{
completed_ = false;
}

void
GPUBufferTransporter::AddBuffer(
const bi::managed_external_buffer::handle_t& handle)
{
if (!completed_) {
buffers_.emplace_back(handle);
} else {
throw PythonBackendException(
"It is not possible to add buffers after 'Complete' has been called on "
"a GPUBufferTransporter.");
}
}

void
GPUBufferTransporter::Complete(
std::unique_ptr<SharedMemoryManager>& shm_pool, bool success,
const std::string& message)
{
if (completed_) {
return;
}
gpu_buffers_shm_ = shm_pool->Construct<GPUBuffersShm>();
if (success) {
buffers_handle_shm_ =
shm_pool->Construct<bi::managed_external_buffer::handle_t>(
buffers_.size());
gpu_buffers_shm_.data_->buffer_count = buffers_.size();
gpu_buffers_shm_.data_->success = true;
gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_;
for (size_t i = 0; i < buffers_.size(); ++i) {
buffers_handle_shm_.data_.get()[i] = buffers_[i];
}
} else {
// If there was an error we won't look at the buffers.
gpu_buffers_shm_.data_->success = false;
error_shm_ = PbString::Create(shm_pool, message);
gpu_buffers_shm_.data_->error = error_shm_->ShmHandle();
}
completed_ = true;
}


bi::managed_external_buffer::handle_t
GPUBufferTransporter::ShmHandle()
{
return gpu_buffers_shm_.handle_;
}

}}} // namespace triton::backend::python
64 changes: 64 additions & 0 deletions src/gpu_buffers.h
@@ -0,0 +1,64 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include "pb_string.h"
#include "pb_utils.h"
#include "scoped_defer.h"

namespace triton { namespace backend { namespace python {

/// \param success indicating whether the request was successful
/// \param error if success is equal to false, the error object will be set.
/// \param buffers list of buffers elements.
/// \param buffer_count the number of buffers.
struct GPUBuffersShm {
bool success;
bi::managed_external_buffer::handle_t error;
bi::managed_external_buffer::handle_t buffers;
uint32_t buffer_count;
};

class GPUBufferTransporter {
public:
GPUBufferTransporter();
void AddBuffer(const bi::managed_external_buffer::handle_t& handle);
void Complete(
std::unique_ptr<SharedMemoryManager>& shm_pool, bool success = true,
const std::string& message = "");
bi::managed_external_buffer::handle_t ShmHandle();

private:
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm_;
std::vector<bi::managed_external_buffer::handle_t> buffers_;
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
buffers_handle_shm_;
std::unique_ptr<PbString> error_shm_;
bool completed_;
};

}}}; // namespace triton::backend::python
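
For orientation, a minimal usage sketch of the GPUBufferTransporter API declared above. This is not part of the commit; the function name ShareGpuBuffers and both parameters are illustrative assumptions, and the call sequence follows the declarations in gpu_buffers.h and the call sites in infer_response.cc below.

#include "gpu_buffers.h"

namespace triton { namespace backend { namespace python {

// Illustrative sketch only: how a parent-process caller is expected to publish
// GPU buffer handles with GPUBufferTransporter. 'shm_pool' and
// 'buffer_handles' are assumed to be provided by the caller.
void
ShareGpuBuffers(
    std::unique_ptr<SharedMemoryManager>& shm_pool,
    const std::vector<bi::managed_external_buffer::handle_t>& buffer_handles)
{
  GPUBufferTransporter transporter;

  // Register the shared-memory handle of every GPU buffer the stub must fill.
  for (const auto& handle : buffer_handles) {
    transporter.AddBuffer(handle);
  }

  // Writes a GPUBuffersShm record (success flag, handle array, count) into
  // shared memory. Calling Complete(shm_pool, false, message) instead records
  // an error string that the reader surfaces before touching any buffer.
  transporter.Complete(shm_pool);

  // The returned handle is what fields such as RequestBatch::gpu_buffers_handle
  // point at, so the other process can shm_pool->Load<GPUBuffersShm>() it.
  bi::managed_external_buffer::handle_t buffers_handle = transporter.ShmHandle();
  (void)buffers_handle;  // in practice stored into the request/response message
}

}}}  // namespace triton::backend::python
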
11 changes: 10 additions & 1 deletion src/infer_request.cc
@@ -28,6 +28,7 @@

#include <boost/interprocess/sync/scoped_lock.hpp>

#include "gpu_buffers.h"
#include "pb_utils.h"
#include "scoped_defer.h"
#ifdef TRITON_PB_STUB
@@ -481,12 +482,20 @@ InferRequest::Exec(const bool is_decoupled)
// Additional round trip required for asking the stub process
// to fill in the GPU tensor buffers
if (has_gpu_tensor) {
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
shm_pool->Load<GPUBuffersShm>(
request_batch_shm_ptr->gpu_buffers_handle);
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
gpu_buffers_handle =
shm_pool->Load<bi::managed_external_buffer::handle_t>(
request_batch_shm_ptr->gpu_buffers_handle);
gpu_buffers_shm.data_->buffers);
try {
#ifdef TRITON_ENABLE_GPU
if (!gpu_buffers_shm.data_->success) {
std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
shm_pool, gpu_buffers_shm.data_->error);
throw PythonBackendException(error->String());
}
size_t i = 0;
for (auto& input_tensor : this->Inputs()) {
if (!input_tensor->IsCPU()) {
23 changes: 17 additions & 6 deletions src/infer_response.cc
@@ -206,6 +206,7 @@ InferResponse::Send(
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBufferTransporter& gpu_buffer_transporter,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names)
{
@@ -228,12 +229,20 @@

// Moves the response sending callback so that it is not called until the stub
// process fills in the GPU buffers.
ScopedDefer deferred_task(
[this, &requires_deferred_callback, &response_error_handling] {
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});
ScopedDefer deferred_task([this, &requires_deferred_callback,
&response_error_handling, &gpu_buffer_transporter,
response_error, &shm_pool] {
if (*response_error != nullptr) {
gpu_buffer_transporter.Complete(
shm_pool, false /* success */,
TRITONSERVER_ErrorMessage(*response_error));
} else {
gpu_buffer_transporter.Complete(shm_pool);
}
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});

if (HasError()) {
*response_error = TRITONSERVER_ErrorNew(
@@ -302,6 +311,7 @@
output_tensor->ByteSize(), reinterpret_cast<char*>(buffer),
true /* copy_gpu */));
}
gpu_buffer_transporter.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
#endif
}
@@ -316,6 +326,7 @@
shm_pool, actual_memory_type, actual_memory_type_id,
output_tensor->ByteSize(), nullptr /* data ptr */));

gpu_buffer_transporter.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
}

2 changes: 2 additions & 0 deletions src/infer_response.h
@@ -27,6 +27,7 @@
#pragma once

#include <future>
#include "gpu_buffers.h"
#include "pb_error.h"
#include "pb_tensor.h"
#include "pb_utils.h"
@@ -100,6 +101,7 @@
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBufferTransporter& gpu_buffer_transporter,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names = {});

52 changes: 30 additions & 22 deletions src/pb_stub.cc
@@ -356,9 +356,10 @@ Stub::RunCommand()
LoadGPUBuffers(ipc_message);
}
catch (const PythonBackendException& pb_exception) {
LOG_INFO << "An error occurred while trying to load GPU buffers in the "
"Python backend stub: "
<< pb_exception.what() << std::endl;
LOG_ERROR
<< "An error occurred while trying to load GPU buffers in the "
"Python backend stub: "
<< pb_exception.what() << std::endl;
}

break;
@@ -539,43 +540,50 @@ Stub::ProcessResponse(InferResponse* response)
void
Stub::LoadGPUBuffers(std::unique_ptr<IPCMessage>& ipc_message)
{
AllocatedSharedMemory<char> gpu_buffers_handle =
shm_pool_->Load<char>(ipc_message->Args());
ScopedDefer load_gpu_buffer_response([this] {
// LoadGPUBuffers must let the parent process know when loading the
// buffers have been finished.
parent_message_queue_->Push(DUMMY_MESSAGE);
gpu_tensors_.clear();
});

uint64_t* gpu_buffer_count =
reinterpret_cast<uint64_t*>(gpu_buffers_handle.data_.get());
bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm =
reinterpret_cast<bi::managed_external_buffer::handle_t*>(
gpu_buffers_handle.data_.get() + sizeof(uint64_t));
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_handle =
shm_pool_->Load<GPUBuffersShm>(ipc_message->Args());

if (!gpu_buffers_handle.data_->success) {
std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
shm_pool_, gpu_buffers_handle.data_->error);
LOG_ERROR << ("Failed to load GPU buffers: " + error->String());
return;
}

uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count;
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
gpu_buffers_handle_shm =
shm_pool_->Load<bi::managed_external_buffer::handle_t>(
gpu_buffers_handle.data_->buffers);

if (gpu_tensors_.size() != *gpu_buffer_count) {
LOG_INFO
if (gpu_tensors_.size() != gpu_buffer_count) {
LOG_ERROR
<< (std::string(
"GPU buffers size does not match the provided buffers: ") +
std::to_string(gpu_tensors_.size()) +
" != " + std::to_string(*gpu_buffer_count));
" != " + std::to_string(gpu_buffer_count));
return;
}

std::vector<std::unique_ptr<PbMemory>> dst_buffers;

for (size_t i = 0; i < gpu_tensors_.size(); i++) {
std::unique_ptr<PbMemory> dst_buffer = PbMemory::LoadFromSharedMemory(
shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */);
shm_pool_, gpu_buffers_handle_shm.data_.get()[i],
true /* open_cuda_handle */);
dst_buffers.emplace_back(std::move(dst_buffer));
}

ScopedDefer load_gpu_buffer_response([this] {
// Push a dummy message to signal the thread to terminate.
parent_message_queue_->Push(DUMMY_MESSAGE);
});

for (size_t i = 0; i < gpu_tensors_.size(); i++) {
std::shared_ptr<PbTensor>& src_buffer = gpu_tensors_[i];
PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory());
}

gpu_tensors_.clear();
}

py::list
10 changes: 2 additions & 8 deletions src/pb_utils.h
@@ -212,23 +212,17 @@ struct ResponseSenderBase {
struct ResponseSendMessage : ResponseSenderBase {
bi::managed_external_buffer::handle_t response;

// GPU Buffers handle
// A pointer to GPUBuffersShm object.
bi::managed_external_buffer::handle_t gpu_buffers_handle;

// GPU buffers count
uint32_t gpu_buffers_count;

uint32_t flags;
};

struct RequestBatch {
uint32_t batch_size;

// GPU Buffers handle
// A pointer to GPUBuffersShm object.
bi::managed_external_buffer::handle_t gpu_buffers_handle;

// GPU buffers count
uint32_t gpu_buffers_count;
};

#ifdef TRITON_ENABLE_GPU
