Fix GPU buffer handling
Tabrizian committed May 29, 2023
1 parent 3336477 commit 00df006
Showing 11 changed files with 236 additions and 112 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -163,6 +163,8 @@ set(
src/metric.cc
src/metric_family.h
src/metric_family.cc
src/gpu_buffers.cc
src/gpu_buffers.h
)

set(
84 changes: 84 additions & 0 deletions src/gpu_buffers.cc
@@ -0,0 +1,84 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "gpu_buffers.h"
#include "pb_string.h"

namespace triton { namespace backend { namespace python {
GPUBufferTransporter::GPUBufferTransporter()
{
completed_ = false;
}

void
GPUBufferTransporter::AddBuffer(
const bi::managed_external_buffer::handle_t& handle)
{
if (!completed_) {
buffers_.emplace_back(handle);
} else {
throw PythonBackendException(
"It is not possible to add buffers after 'Complete' has been called on "
"a GPUBufferTransporter.");
}
}

void
GPUBufferTransporter::Complete(
std::unique_ptr<SharedMemoryManager>& shm_pool, bool success,
const std::string& message)
{
if (completed_) {
return;
}
gpu_buffers_shm_ = shm_pool->Construct<GPUBuffersShm>();
if (success) {
buffers_handle_shm_ =
shm_pool->Construct<bi::managed_external_buffer::handle_t>(
buffers_.size());
gpu_buffers_shm_.data_->buffer_count = buffers_.size();
gpu_buffers_shm_.data_->success = true;
gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_;
for (size_t i = 0; i < buffers_.size(); ++i) {
buffers_handle_shm_.data_.get()[i] = buffers_[i];
}
} else {
// If there was an error we won't look at the buffers.
gpu_buffers_shm_.data_->success = false;
error_shm_ = PbString::Create(shm_pool, message);
gpu_buffers_shm_.data_->error = error_shm_->ShmHandle();
}
completed_ = true;
}


bi::managed_external_buffer::handle_t
GPUBufferTransporter::ShmHandle()
{
return gpu_buffers_shm_.handle_;
}

}}} // namespace triton::backend::python
64 changes: 64 additions & 0 deletions src/gpu_buffers.h
@@ -0,0 +1,64 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include "pb_string.h"
#include "pb_utils.h"
#include "scoped_defer.h"

namespace triton { namespace backend { namespace python {

/// \param success indicating whether the request was successful
/// \param error if success is equal to false, the error object will be set.
/// \param buffers list of buffers elements.
/// \param buffer_count the number of buffers.
struct GPUBuffersShm {
bool success;
bi::managed_external_buffer::handle_t error;
bi::managed_external_buffer::handle_t buffers;
uint32_t buffer_count;
};

class GPUBufferTransporter {
public:
GPUBufferTransporter();
void AddBuffer(const bi::managed_external_buffer::handle_t& handle);
void Complete(
std::unique_ptr<SharedMemoryManager>& shm_pool, bool success = true,
const std::string& message = "");
bi::managed_external_buffer::handle_t ShmHandle();

private:
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm_;
std::vector<bi::managed_external_buffer::handle_t> buffers_;
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
buffers_handle_shm_;
std::unique_ptr<PbString> error_shm_;
bool completed_;
};

}}}; // namespace triton::backend::python
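
For orientation, a minimal usage sketch of the GPUBufferTransporter API declared above. This is not part of the commit; the function name ShareGpuBuffers and both parameters are illustrative assumptions, and the call sequence follows the declarations in gpu_buffers.h and the call sites in infer_response.cc below.

#include "gpu_buffers.h"

namespace triton { namespace backend { namespace python {

// Illustrative sketch only: how a parent-process caller is expected to publish
// GPU buffer handles with GPUBufferTransporter. 'shm_pool' and
// 'buffer_handles' are assumed to be provided by the caller.
void
ShareGpuBuffers(
    std::unique_ptr<SharedMemoryManager>& shm_pool,
    const std::vector<bi::managed_external_buffer::handle_t>& buffer_handles)
{
  GPUBufferTransporter transporter;

  // Register the shared-memory handle of every GPU buffer the stub must fill.
  for (const auto& handle : buffer_handles) {
    transporter.AddBuffer(handle);
  }

  // Writes a GPUBuffersShm record (success flag, handle array, count) into
  // shared memory. Calling Complete(shm_pool, false, message) instead records
  // an error string that the reader surfaces before touching any buffer.
  transporter.Complete(shm_pool);

  // The returned handle is what fields such as RequestBatch::gpu_buffers_handle
  // point at, so the other process can shm_pool->Load<GPUBuffersShm>() it.
  bi::managed_external_buffer::handle_t buffers_handle = transporter.ShmHandle();
  (void)buffers_handle;  // in practice stored into the request/response message
}

}}}  // namespace triton::backend::python
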
11 changes: 10 additions & 1 deletion src/infer_request.cc
@@ -28,6 +28,7 @@

#include <boost/interprocess/sync/scoped_lock.hpp>

#include "gpu_buffers.h"
#include "pb_utils.h"
#include "scoped_defer.h"
#ifdef TRITON_PB_STUB
@@ -481,12 +482,20 @@ InferRequest::Exec(const bool is_decoupled)
// Additional round trip required for asking the stub process
// to fill in the GPU tensor buffers
if (has_gpu_tensor) {
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
shm_pool->Load<GPUBuffersShm>(
request_batch_shm_ptr->gpu_buffers_handle);
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
gpu_buffers_handle =
shm_pool->Load<bi::managed_external_buffer::handle_t>(
request_batch_shm_ptr->gpu_buffers_handle);
gpu_buffers_shm.data_->buffers);
try {
#ifdef TRITON_ENABLE_GPU
if (!gpu_buffers_shm.data_->success) {
std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
shm_pool, gpu_buffers_shm.data_->error);
throw PythonBackendException(error->String());
}
size_t i = 0;
for (auto& input_tensor : this->Inputs()) {
if (!input_tensor->IsCPU()) {
23 changes: 17 additions & 6 deletions src/infer_response.cc
@@ -206,6 +206,7 @@ InferResponse::Send(
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBufferTransporter& gpu_buffer_transporter,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names)
{
@@ -228,12 +229,20 @@

// Moves the response sending callback so that it is not called until the stub
// process fills in the GPU buffers.
ScopedDefer deferred_task(
[this, &requires_deferred_callback, &response_error_handling] {
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});
ScopedDefer deferred_task([this, &requires_deferred_callback,
&response_error_handling, &gpu_buffer_transporter,
response_error, &shm_pool] {
if (*response_error != nullptr) {
gpu_buffer_transporter.Complete(
shm_pool, false /* success */,
TRITONSERVER_ErrorMessage(*response_error));
} else {
gpu_buffer_transporter.Complete(shm_pool);
}
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});

if (HasError()) {
*response_error = TRITONSERVER_ErrorNew(
@@ -302,6 +311,7 @@
output_tensor->ByteSize(), reinterpret_cast<char*>(buffer),
true /* copy_gpu */));
}
gpu_buffer_transporter.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
#endif
}
@@ -316,6 +326,7 @@
shm_pool, actual_memory_type, actual_memory_type_id,
output_tensor->ByteSize(), nullptr /* data ptr */));

gpu_buffer_transporter.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
}

2 changes: 2 additions & 0 deletions src/infer_response.h
@@ -27,6 +27,7 @@
#pragma once

#include <future>
#include "gpu_buffers.h"
#include "pb_error.h"
#include "pb_tensor.h"
#include "pb_utils.h"
@@ -100,6 +101,7 @@
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBufferTransporter& gpu_buffer_transporter,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names = {});

52 changes: 30 additions & 22 deletions src/pb_stub.cc
@@ -356,9 +356,10 @@ Stub::RunCommand()
LoadGPUBuffers(ipc_message);
}
catch (const PythonBackendException& pb_exception) {
LOG_INFO << "An error occurred while trying to load GPU buffers in the "
"Python backend stub: "
<< pb_exception.what() << std::endl;
LOG_ERROR
<< "An error occurred while trying to load GPU buffers in the "
"Python backend stub: "
<< pb_exception.what() << std::endl;
}

break;
@@ -539,43 +540,50 @@ Stub::ProcessResponse(InferResponse* response)
void
Stub::LoadGPUBuffers(std::unique_ptr<IPCMessage>& ipc_message)
{
AllocatedSharedMemory<char> gpu_buffers_handle =
shm_pool_->Load<char>(ipc_message->Args());
ScopedDefer load_gpu_buffer_response([this] {
// LoadGPUBuffers must let the parent process know when loading the
// buffers have been finished.
parent_message_queue_->Push(DUMMY_MESSAGE);
gpu_tensors_.clear();
});

uint64_t* gpu_buffer_count =
reinterpret_cast<uint64_t*>(gpu_buffers_handle.data_.get());
bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm =
reinterpret_cast<bi::managed_external_buffer::handle_t*>(
gpu_buffers_handle.data_.get() + sizeof(uint64_t));
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_handle =
shm_pool_->Load<GPUBuffersShm>(ipc_message->Args());

if (!gpu_buffers_handle.data_->success) {
std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
shm_pool_, gpu_buffers_handle.data_->error);
LOG_ERROR << ("Failed to load GPU buffers: " + error->String());
return;
}

uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count;
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
gpu_buffers_handle_shm =
shm_pool_->Load<bi::managed_external_buffer::handle_t>(
gpu_buffers_handle.data_->buffers);

if (gpu_tensors_.size() != *gpu_buffer_count) {
LOG_INFO
if (gpu_tensors_.size() != gpu_buffer_count) {
LOG_ERROR
<< (std::string(
"GPU buffers size does not match the provided buffers: ") +
std::to_string(gpu_tensors_.size()) +
" != " + std::to_string(*gpu_buffer_count));
" != " + std::to_string(gpu_buffer_count));
return;
}

std::vector<std::unique_ptr<PbMemory>> dst_buffers;

for (size_t i = 0; i < gpu_tensors_.size(); i++) {
std::unique_ptr<PbMemory> dst_buffer = PbMemory::LoadFromSharedMemory(
shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */);
shm_pool_, gpu_buffers_handle_shm.data_.get()[i],
true /* open_cuda_handle */);
dst_buffers.emplace_back(std::move(dst_buffer));
}

ScopedDefer load_gpu_buffer_response([this] {
// Push a dummy message to signal the thread to terminate.
parent_message_queue_->Push(DUMMY_MESSAGE);
});

for (size_t i = 0; i < gpu_tensors_.size(); i++) {
std::shared_ptr<PbTensor>& src_buffer = gpu_tensors_[i];
PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory());
}

gpu_tensors_.clear();
}

py::list
10 changes: 2 additions & 8 deletions src/pb_utils.h
@@ -212,23 +212,17 @@ struct ResponseSenderBase {
struct ResponseSendMessage : ResponseSenderBase {
bi::managed_external_buffer::handle_t response;

// GPU Buffers handle
// A pointer to GPUBuffersShm object.
bi::managed_external_buffer::handle_t gpu_buffers_handle;

// GPU buffers count
uint32_t gpu_buffers_count;

uint32_t flags;
};

struct RequestBatch {
uint32_t batch_size;

// GPU Buffers handle
// A pointer to GPUBuffersShm object.
bi::managed_external_buffer::handle_t gpu_buffers_handle;

// GPU buffers count
uint32_t gpu_buffers_count;
};

#ifdef TRITON_ENABLE_GPU
