Fix error handling for GPU tensors (#249)
* Fix error handling for GPU tensors

* Fix GPU buffer handling

* Review edit

* Fix for dynamically batched responses with GPU tensor

* Review edits

* Fix unused i variable for GPU=OFF

* Review comments

* Review edit
Tabrizian authored Jun 6, 2023
1 parent 637c7e3 commit 0a54e59
Showing 10 changed files with 280 additions and 172 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -163,6 +163,8 @@ set(
src/metric.cc
src/metric_family.h
src/metric_family.cc
src/gpu_buffers.cc
src/gpu_buffers.h
)

set(
88 changes: 88 additions & 0 deletions src/gpu_buffers.cc
@@ -0,0 +1,88 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "gpu_buffers.h"
#include "pb_string.h"

namespace triton { namespace backend { namespace python {
GPUBuffersHelper::GPUBuffersHelper()
{
completed_ = false;
}

void
GPUBuffersHelper::AddBuffer(const bi::managed_external_buffer::handle_t& handle)
{
if (completed_) {
throw PythonBackendException(
"It is not possible to add buffers after 'Complete' has been called on "
"a GPUBuffersHelper.");
}

buffers_.emplace_back(handle);
}

void
GPUBuffersHelper::SetError(
std::unique_ptr<SharedMemoryManager>& shm_pool, const std::string& error)
{
error_shm_ = PbString::Create(shm_pool, error);
}

void
GPUBuffersHelper::Complete(std::unique_ptr<SharedMemoryManager>& shm_pool)
{
if (completed_) {
throw PythonBackendException(
"Complete has already been called. Complete should only be called "
"once.");
}
gpu_buffers_shm_ = shm_pool->Construct<GPUBuffersShm>();
if (!error_shm_) {
buffers_handle_shm_ =
shm_pool->Construct<bi::managed_external_buffer::handle_t>(
buffers_.size());
gpu_buffers_shm_.data_->buffer_count = buffers_.size();
gpu_buffers_shm_.data_->success = true;
gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_;
for (size_t i = 0; i < buffers_.size(); ++i) {
buffers_handle_shm_.data_.get()[i] = buffers_[i];
}
} else {
gpu_buffers_shm_.data_->success = false;
gpu_buffers_shm_.data_->error = error_shm_->ShmHandle();
}
completed_ = true;
}


bi::managed_external_buffer::handle_t
GPUBuffersHelper::ShmHandle()
{
return gpu_buffers_shm_.handle_;
}

}}} // namespace triton::backend::python
67 changes: 67 additions & 0 deletions src/gpu_buffers.h
@@ -0,0 +1,67 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include "pb_string.h"
#include "pb_utils.h"
#include "scoped_defer.h"

namespace triton { namespace backend { namespace python {

/// \param success indicates whether the process of fetching the GPU buffers
/// was successful.
/// \param error if success is false, the error object will be set.
/// \param buffers list of buffer elements.
/// \param buffer_count the number of buffers.
struct GPUBuffersShm {
bool success;
bi::managed_external_buffer::handle_t error;
bi::managed_external_buffer::handle_t buffers;
uint32_t buffer_count;
};

/// Helper class to facilitate transfer of metadata associated with
/// the GPU buffers in shared memory.
class GPUBuffersHelper {
public:
GPUBuffersHelper();
void AddBuffer(const bi::managed_external_buffer::handle_t& handle);
void Complete(std::unique_ptr<SharedMemoryManager>& shm_pool);
void SetError(
std::unique_ptr<SharedMemoryManager>& shm_pool, const std::string& error);
bi::managed_external_buffer::handle_t ShmHandle();

private:
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm_;
std::vector<bi::managed_external_buffer::handle_t> buffers_;
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
buffers_handle_shm_;
std::unique_ptr<PbString> error_shm_;
bool completed_;
};

}}}; // namespace triton::backend::python
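
To illustrate the intended call pattern of the GPUBuffersHelper declared above, here is a minimal writer-side sketch. It is not part of the commit: the function name, the incoming vector of buffer handles, and the wrapper are illustrative assumptions; only the GPUBuffersHelper methods and the SharedMemoryManager/PythonBackendException types come from the backend's existing headers.

// Illustrative sketch only -- not code from this commit.
#include <memory>
#include <vector>

#include "gpu_buffers.h"

namespace triton { namespace backend { namespace python {

// `buffer_handles` stands in for the shm handles of the buffers that hold
// the GPU outputs.
bi::managed_external_buffer::handle_t
PublishGpuBuffers(
    std::unique_ptr<SharedMemoryManager>& shm_pool,
    const std::vector<bi::managed_external_buffer::handle_t>& buffer_handles)
{
  GPUBuffersHelper gpu_buffer_helper;
  for (const auto& handle : buffer_handles) {
    // Record every GPU buffer that the peer process must fill in.
    gpu_buffer_helper.AddBuffer(handle);
  }
  // If collecting the buffers had failed, the error would be recorded
  // instead, e.g.: gpu_buffer_helper.SetError(shm_pool, pb_exception.what());

  // Write either the handle list or the error to shared memory, exactly once.
  gpu_buffer_helper.Complete(shm_pool);

  // This handle is what gets placed in the request/response batch so the
  // other process can Load<GPUBuffersShm>() it.
  return gpu_buffer_helper.ShmHandle();
}

}}}  // namespace triton::backend::python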
11 changes: 10 additions & 1 deletion src/infer_request.cc
@@ -28,6 +28,7 @@

#include <boost/interprocess/sync/scoped_lock.hpp>

#include "gpu_buffers.h"
#include "pb_utils.h"
#include "scoped_defer.h"
#ifdef TRITON_PB_STUB
@@ -481,11 +482,19 @@ InferRequest::Exec(const bool is_decoupled)
// Additional round trip required for asking the stub process
// to fill in the GPU tensor buffers
if (has_gpu_tensor) {
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
shm_pool->Load<GPUBuffersShm>(
request_batch_shm_ptr->gpu_buffers_handle);
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
gpu_buffers_handle =
shm_pool->Load<bi::managed_external_buffer::handle_t>(
request_batch_shm_ptr->gpu_buffers_handle);
gpu_buffers_shm.data_->buffers);
try {
if (!gpu_buffers_shm.data_->success) {
std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
shm_pool, gpu_buffers_shm.data_->error);
throw PythonBackendException(error->String());
}
#ifdef TRITON_ENABLE_GPU
size_t i = 0;
for (auto& input_tensor : this->Inputs()) {
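
The hunk above is truncated, so for completeness here is a hedged sketch of the reader side of the protocol. It mirrors the pattern visible in the hunk — load the GPUBuffersShm record, re-throw a stored error, otherwise walk the buffer_count handles — but the wrapper function and names are illustrative, not code from the commit.

// Illustrative sketch only -- not code from this commit.
#include <memory>
#include <vector>

#include "gpu_buffers.h"
#include "pb_string.h"

namespace triton { namespace backend { namespace python {

// Loads the GPU buffer handles published by GPUBuffersHelper::Complete().
std::vector<bi::managed_external_buffer::handle_t>
LoadGpuBufferHandles(
    std::unique_ptr<SharedMemoryManager>& shm_pool,
    bi::managed_external_buffer::handle_t gpu_buffers_handle)
{
  AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
      shm_pool->Load<GPUBuffersShm>(gpu_buffers_handle);

  // If the writer recorded an error, surface it as an exception on this side.
  if (!gpu_buffers_shm.data_->success) {
    std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
        shm_pool, gpu_buffers_shm.data_->error);
    throw PythonBackendException(error->String());
  }

  AllocatedSharedMemory<bi::managed_external_buffer::handle_t> handles_shm =
      shm_pool->Load<bi::managed_external_buffer::handle_t>(
          gpu_buffers_shm.data_->buffers);

  std::vector<bi::managed_external_buffer::handle_t> handles;
  for (uint32_t i = 0; i < gpu_buffers_shm.data_->buffer_count; ++i) {
    handles.push_back(handles_shm.data_.get()[i]);
  }
  return handles;
}

}}}  // namespace triton::backend::python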
54 changes: 20 additions & 34 deletions src/infer_response.cc
@@ -201,64 +201,50 @@ InferResponse::IsLastResponse()
}

#ifndef TRITON_PB_STUB
std::shared_ptr<TRITONSERVER_Error*>
void
InferResponse::Send(
TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream,
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBuffersHelper& gpu_buffer_helper,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names,
TRITONBACKEND_Response* response)
const std::set<std::string>& requested_output_names)
{
std::shared_ptr<TRITONSERVER_Error*> response_error =
WrapTritonErrorInSharedPtr(nullptr);
std::unique_ptr<ScopedDefer> response_error_handling;
requires_deferred_callback = false;

// Should only destruct the response factory whenever a response factory is
// being created.
bool destruct_response_factor = (response == nullptr);

if (response == nullptr) {
SET_ERROR_AND_RETURN(
response_error,
TRITONBACKEND_ResponseNewFromFactory(&response, response_factory));
}

// This lambda expression will be called when this function exits, if the
// inference response doesn't have any GPU tensors. Otherwise, it will be
// called when the object is destructed or DeferredSendCallback is called.
response_error_handling = std::make_unique<ScopedDefer>(
[response, response_error, flags, response_factory,
destruct_response_factor] {
response_error_handling =
std::make_unique<ScopedDefer>([response, response_error, flags] {
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(response, flags, *response_error),
"failed to send the response.");
if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL &&
destruct_response_factor) {
std::unique_ptr<
TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter>
response_factory_ptr(
reinterpret_cast<TRITONBACKEND_ResponseFactory*>(
response_factory));
}
}
});

// Moves the response sending callback so that it is not called until the stub
// process fills in the GPU buffers.
ScopedDefer deferred_task(
[this, &requires_deferred_callback, &response_error_handling] {
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});
ScopedDefer deferred_task([this, &requires_deferred_callback,
&response_error_handling, &gpu_buffer_helper,
response_error, &shm_pool] {
if (*response_error != nullptr) {
gpu_buffer_helper.SetError(
shm_pool, TRITONSERVER_ErrorMessage(*response_error));
}
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});

if (HasError()) {
*response_error = TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str());
return nullptr;
return;
}

bool cuda_copy = false;
@@ -322,6 +308,7 @@ InferResponse::Send(
output_tensor->ByteSize(), reinterpret_cast<char*>(buffer),
true /* copy_gpu */));
}
gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
#endif
}
@@ -336,6 +323,7 @@ InferResponse::Send(
shm_pool, actual_memory_type, actual_memory_type_id,
output_tensor->ByteSize(), nullptr /* data ptr */));

gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
}

@@ -357,8 +345,6 @@
cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
}
#endif // TRITON_ENABLE_GPU

return response_error;
}
#endif

13 changes: 7 additions & 6 deletions src/infer_response.h
@@ -27,6 +27,7 @@
#pragma once

#include <future>
#include "gpu_buffers.h"
#include "pb_error.h"
#include "pb_tensor.h"
#include "pb_utils.h"
@@ -49,7 +50,7 @@ struct ResponseShm {
TRITONSERVER_Error* raasnie_err__ = (X); \
if (raasnie_err__ != nullptr) { \
*E = raasnie_err__; \
return E; \
return; \
} \
} while (false)

@@ -62,7 +63,7 @@ struct ResponseShm {
TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \
TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \
*E = rarie_err__; \
return E; \
return; \
} \
} while (false)

@@ -96,13 +97,13 @@ class InferResponse {
/// response needs to be done in two steps. The boolean
/// 'requires_deferred_callback' indicates whether the DeferredSendCallback
/// method should be called or not.
std::shared_ptr<TRITONSERVER_Error*> Send(
TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream,
void Send(
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBuffersHelper& gpu_buffer_helper,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names = {},
TRITONBACKEND_Response* response = nullptr);
const std::set<std::string>& requested_output_names = {});

void DeferredSendCallback();
#endif
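
Since the doc comment above only hints at the two-step flow, here is a hedged caller-side sketch of the new Send signature. The wrapper function, the commented-out wait step, and the variable names are assumptions for illustration; only the Send and DeferredSendCallback signatures and the GPUBuffersHelper calls come from the headers shown in this diff.

// Illustrative sketch only -- not code from this commit.
#include <memory>
#include <set>
#include <utility>
#include <vector>

#include "infer_response.h"

namespace triton { namespace backend { namespace python {

void
SendResponseSketch(
    std::unique_ptr<InferResponse>& infer_response,
    TRITONBACKEND_Response* response, void* cuda_stream,
    std::unique_ptr<SharedMemoryManager>& shm_pool,
    const std::set<std::string>& requested_output_names)
{
  bool requires_deferred_callback = false;
  GPUBuffersHelper gpu_buffer_helper;
  std::vector<std::pair<std::unique_ptr<PbMemory>, void*>> output_buffers;

  // Step 1: serialize the outputs. For GPU tensors this allocates the shared
  // buffers and records their handles in gpu_buffer_helper; an error raised
  // along the way is captured into gpu_buffer_helper via SetError().
  infer_response->Send(
      response, cuda_stream, requires_deferred_callback,
      TRITONSERVER_RESPONSE_COMPLETE_FINAL, shm_pool, gpu_buffer_helper,
      output_buffers, requested_output_names);

  // Publish the collected handles (or the error) to shared memory.
  gpu_buffer_helper.Complete(shm_pool);

  if (requires_deferred_callback) {
    // Step 2: the response can only be sent after the stub process has
    // copied its GPU tensors into the shared buffers.
    // WaitForStubToFillGpuBuffers();  // stand-in for the real IPC round trip
    infer_response->DeferredSendCallback();
  }
}

}}}  // namespace triton::backend::python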