ARROW-3451: [C++/Python] pyarrow and numba CUDA interop #2732

Closed (wants to merge 35 commits)

Commits: changes shown from 14 of 35 commits
f16148f
ARROW-3430: [Packaging] Add workaround to verify 0.11.0
kou Oct 4, 2018
b629401
ARROW-3431: [GLib] Include Gemfile to archive
kou Oct 4, 2018
cf3aebe
ARROW-3432: [Packaging] Expand variables in commit message
kou Oct 4, 2018
5284ad0
ARROW-3331: [Gandiva][C++] Add re2 to toolchain
Oct 4, 2018
4e20aa9
ARROW-3438: [Packaging] Fix too much Markdown escape in CHANGELOG
kou Oct 5, 2018
50aebe6
Implement interop between pyarrow and numba.
pearu Oct 9, 2018
cba9bcf
Merging with upstream/master
pearu Oct 9, 2018
ef69f35
Implement interop between pyarrow and numba.
pearu Oct 9, 2018
a42ffd1
Merge branch 'arrow-cuda' of github.com:Quansight/arrow into arrow-cuda
pearu Oct 9, 2018
3a1e564
Fix lint issues.
pearu Oct 9, 2018
1107aad
pyarrow and numba CUDA interop
pearu Oct 9, 2018
1478909
Implement a view of device memory (CudaBuffer.foreign_buffer)
pearu Oct 10, 2018
0920105
lost it git messages
pearu Oct 10, 2018
cbe7338
Fix lint-clang formatting issues.
pearu Oct 10, 2018
740e474
Apply feedback from pitrou
pearu Oct 14, 2018
1bf7856
Moved numba jitted function inside the test function.
pearu Oct 15, 2018
f576474
Fix compiler warning re casting pointers with different sizes.
pearu Oct 15, 2018
bdc00ce
Introduce GetSharedContext and use_numba_context.
pearu Oct 16, 2018
42fc232
Fix make lint failures.
pearu Oct 16, 2018
c1b5717
Fix bug that enabled using numba context manager always.
pearu Oct 16, 2018
f27796a
Restore previous context after cuCtxSetCurrent. Remove use_numba_cont…
pearu Oct 16, 2018
cc87f3b
Fix lint issues.
pearu Oct 16, 2018
a6d5376
Introduce ContextSaver. Fix doc strings. Set fixed seed for cuda tests.
pearu Oct 17, 2018
cfb1e74
Fix lint issue.
pearu Oct 17, 2018
c546ce5
Synchronize before copying device data to host.
pearu Oct 18, 2018
415cbd6
Introduce synchronize methods.
pearu Oct 18, 2018
f9e69b2
Fix check-format issues.
pearu Oct 18, 2018
1b0c19e
Implemented context push-pop pattern.
pearu Oct 18, 2018
3e99126
Implemented context push-pop pattern. 2nd try.
pearu Oct 18, 2018
767df42
Introduce device argument to AllocateHost. Use sync in IPC test.
pearu Oct 18, 2018
39d3f69
Introduce CloseIpcBuffer, fixes IPC test.
pearu Oct 18, 2018
fc5f505
Expose CudaContext device number for CudaBufferWriter in buffering mode.
pearu Oct 18, 2018
8457e00
Use C++ style cast.
pearu Oct 23, 2018
b6f4ede
Remove contexts_ cache and related methods CreateNewContext and Creat…
pearu Oct 23, 2018
b503b0c
Update docstring for GetContext
pitrou Oct 24, 2018
31 changes: 31 additions & 0 deletions cpp/src/arrow/gpu/cuda_context.cc
@@ -44,11 +44,20 @@ class CudaContext::CudaContextImpl {

Status Init(const CudaDevice& device) {
device_ = device;
own_context_ = true;
CU_RETURN_NOT_OK(cuCtxCreate(&context_, 0, device_.handle));
is_open_ = true;
return Status::OK();
}

Status InitShared(const CudaDevice& device, CUcontext ctx) {
device_ = device;
own_context_ = false;
context_ = ctx;
is_open_ = true;
return Status::OK();
}

Status Close() {
if (is_open_ && own_context_) {
CU_RETURN_NOT_OK(cuCtxDestroy(context_));
@@ -110,6 +119,8 @@ class CudaContext::CudaContextImpl {

const CudaDevice device() const { return device_; }

const void* context_handle() const { return reinterpret_cast<void*>(context_); }

private:
CudaDevice device_;
CUcontext context_;
@@ -165,6 +176,13 @@ class CudaDeviceManager::CudaDeviceManagerImpl {
return (*out)->impl_->Init(devices_[device_number]);
}

Status CreateSharedContext(int device_number, CUcontext ctx,
std::shared_ptr<CudaContext>* out) {
// TODO: check if context exists already, if so, return it.
*out = std::shared_ptr<CudaContext>(new CudaContext());
return (*out)->impl_->InitShared(devices_[device_number], ctx);
}

Status GetContext(int device_number, std::shared_ptr<CudaContext>* out) {
auto it = contexts_.find(device_number);
if (it == contexts_.end()) {
@@ -212,6 +230,11 @@ Status CudaDeviceManager::CreateNewContext(int device_number,
return impl_->CreateNewContext(device_number, out);
}

Status CudaDeviceManager::CreateSharedContext(int device_number, void* ctx,
std::shared_ptr<CudaContext>* out) {
return impl_->CreateSharedContext(device_number, (CUcontext)ctx, out);
}

Status CudaDeviceManager::AllocateHost(int64_t nbytes,
std::shared_ptr<CudaHostBuffer>* out) {
uint8_t* data = nullptr;
@@ -240,6 +263,12 @@ Status CudaContext::Allocate(int64_t nbytes, std::shared_ptr<CudaBuffer>* out) {
return Status::OK();
}

Status CudaContext::View(uint8_t* data, int64_t nbytes,
std::shared_ptr<CudaBuffer>* out) {
*out = std::make_shared<CudaBuffer>(data, nbytes, this->shared_from_this(), false);
return Status::OK();
}

Status CudaContext::ExportIpcBuffer(void* data,
std::shared_ptr<CudaIpcMemHandle>* handle) {
return impl_->ExportIpcBuffer(data, handle);
@@ -276,5 +305,7 @@ Status CudaContext::OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle,

int64_t CudaContext::bytes_allocated() const { return impl_->bytes_allocated(); }

const void* CudaContext::handle() const { return impl_->context_handle(); }

} // namespace gpu
} // namespace arrow
17 changes: 17 additions & 0 deletions cpp/src/arrow/gpu/cuda_context.h
@@ -44,6 +44,13 @@ class ARROW_EXPORT CudaDeviceManager {
/// In general code will use GetContext
Status CreateNewContext(int gpu_number, std::shared_ptr<CudaContext>* ctx);

/// \brief Create shared context for a given device number
(Review thread)

Member:
I admit I don't understand the difference between "get shared context" and "create shared context".

Contributor Author (pearu):
"Create shared context" creates a CudaContext with a context handle that is owned by (or was created by) another library.
"Get shared context" performs "Create shared context" if the CudaContext for the given device has not been created yet, and caches the CudaContext instance; otherwise it returns the cached instance.
This follows the same logic as the GetContext and CreateNewContext methods.

Member:
Are all those variations actually useful? Isn't GetSharedContext enough?

Contributor Author (pearu):
The difference between the values returned by GetSharedContext and GetContext is that in the latter case the created context is owned by the arrow context manager (which is allowed to destroy the context in its Close method).
For the shared case, the context wrapped by CudaContext is assumed to be managed by another library.

Member:
But why do we need both GetSharedContext and CreateSharedContext?

Contributor Author (pearu):
GetSharedContext is what users should use in their code; CreateSharedContext does not need to be exposed to users. Similarly, CreateNewContext does not need to be exposed either, IMHO.
However, whoever designed CudaContext must have had something in mind when making CreateNewContext public. There is also the CudaContext::Close method, which is currently not used anywhere in Arrow...
So, I don't know the answer. I am fine with making CreateSharedContext private.

(End of review thread)

/// \param[in] device_number
/// \param[in] handle CUDA context handler created by another library
/// \param[out] out shared context
Status CreateSharedContext(int device_number, void* handle,
std::shared_ptr<CudaContext>* out);

Status AllocateHost(int64_t nbytes, std::shared_ptr<CudaHostBuffer>* buffer);

Status FreeHost(void* data, int64_t nbytes);
@@ -76,6 +83,13 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext
/// \return Status
Status Allocate(int64_t nbytes, std::shared_ptr<CudaBuffer>* out);

/// \brief Create a view of CUDA memory on GPU device of this context
/// \param[in] data the starting device address
/// \param[in] nbytes number of bytes
/// \param[out] out the view buffer
/// \return Status
Status View(uint8_t* data, int64_t nbytes, std::shared_ptr<CudaBuffer>* out);

/// \brief Open existing CUDA IPC memory handle
/// \param[in] ipc_handle opaque pointer to CUipcMemHandle (driver API)
/// \param[out] buffer a CudaBuffer referencing
@@ -85,6 +99,9 @@

int64_t bytes_allocated() const;

/// \brief Expose CUDA context handle to other libraries
const void* handle() const;

private:
CudaContext();

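As discussed in the review thread above, CreateSharedContext only wraps a CUcontext that is owned by another library; Arrow never destroys it. A minimal sketch of how this surfaces through the Python Context constructor added below, assuming numba is installed, a CUDA device is available, and using numba's current_context() helper:

    import numba.cuda
    from pyarrow import cuda

    # numba owns the driver context; pyarrow only borrows its handle.
    nb_ctx = numba.cuda.current_context()
    ctx = cuda.Context(device_number=nb_ctx.device.id,
                       handle=nb_ctx.handle.value)
    assert ctx.handle == nb_ctx.handle.value
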
115 changes: 112 additions & 3 deletions python/pyarrow/_cuda.pyx
@@ -26,15 +26,17 @@ cdef class Context:
""" CUDA driver context.
"""

def __cinit__(self, int device_number=0):
def __cinit__(self, int device_number=0, uintptr_t handle=0):
"""Construct the shared CUDA driver context for a particular device.

Parameters
----------
device_number : int
Specify the gpu device for which the CUDA driver context is
requested.

handle : void_p
Show resolved Hide resolved
Specify handle for a shared context that has been created by
another library.
"""
cdef CCudaDeviceManager* manager
check_status(CCudaDeviceManager.GetInstance(&manager))
@@ -43,9 +45,57 @@ cdef class Context:
self.context.reset()
raise ValueError('device_number argument must be '
'non-negative less than %s' % (n))
check_status(manager.GetContext(device_number, &self.context))
if handle == 0:
check_status(manager.GetContext(device_number, &self.context))
else:
check_status(manager.CreateSharedContext(device_number,
<void*>handle,
&self.context))
self.device_number = device_number

@staticmethod
def from_numba(context=None):
"""Create Context instance from a numba CUDA context.

Parameters
----------
context : {numba.cuda.cudadrv.driver.Context, None}
Specify numba CUDA context instance. When None, use the
current numba context.

Returns
-------
shared_context : pyarrow.cuda.Context
Context instance.
"""
if context is None:
import numba.cuda
context = numba.cuda.cudadrv.devices.get_context()
return Context(device_number=context.device.id,
handle=context.handle.value)

def to_numba(self):
"""Convert Context to numba CUDA context.

Returns
-------
context : numba.cuda.cudadrv.driver.Context
Numba CUDA context instance.
"""
import ctypes
import numba.cuda
device = numba.cuda.gpus[self.device_number]
handle = ctypes.c_void_p(self.handle)
context = numba.cuda.cudadrv.driver.Context(device, handle)

class DummyPendingDeallocs(object):
# Context is managed by pyarrow
def add_item(self, *args, **kwargs):
pass

context.deallocations = DummyPendingDeallocs()
return context
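
A usage sketch for the from_numba/to_numba pair above (illustrative only; assumes numba is installed and a CUDA device is available):

    from pyarrow import cuda

    ctx = cuda.Context.from_numba()   # wrap the current numba context
    nb_ctx = ctx.to_numba()           # hand the same handle back to numba
    assert ctx.handle == nb_ctx.handle.value

Note that deallocation bookkeeping on the returned numba context is disabled (see DummyPendingDeallocs above).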

@staticmethod
def get_num_devices():
""" Return the number of GPU devices.
@@ -60,6 +110,12 @@
"""
return self.device_number

@property
def handle(self):
""" Return pointer to context handle.
"""
return <uintptr_t>self.context.get().handle()

cdef void init(self, const shared_ptr[CCudaContext]& ctx):
self.context = ctx

@@ -86,6 +142,34 @@
check_status(self.context.get().Allocate(nbytes, &cudabuf))
return pyarrow_wrap_cudabuffer(cudabuf)

def foreign_buffer(self, address, size):
"""Create device buffer from device address and size as a view.

The caller is responsible for allocating and freeing the
memory as well as ensuring that the memory belongs to the
CUDA context that this Context instance holds.

Parameters
----------
address : intptr_t
Specify the starting address of the buffer.
size : int
Specify the size of device buffer in bytes.

Returns
-------
cbuf : CudaBuffer
Device buffer as a view of device memory.
"""
cdef:
intptr_t c_addr = address
int64_t c_size = size
shared_ptr[CCudaBuffer] cudabuf
check_status(self.context.get().View(<uint8_t*>c_addr,
c_size,
&cudabuf))
return pyarrow_wrap_cudabuffer(cudabuf)
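
A sketch of viewing a numba-owned allocation through foreign_buffer (illustrative only; memalloc and the MemoryPointer attributes below are numba driver APIs, and the allocation stays owned and freed by numba):

    import numba.cuda
    from pyarrow import cuda

    nb_ctx = numba.cuda.current_context()
    mem = nb_ctx.memalloc(128)                 # allocated and freed by numba
    ctx = cuda.Context.from_numba(nb_ctx)
    cbuf = ctx.foreign_buffer(mem.device_pointer.value, mem.size)
    assert cbuf.size == 128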

def open_ipc_buffer(self, ipc_handle):
""" Open existing CUDA IPC memory handle

@@ -232,6 +316,31 @@ cdef class CudaBuffer(Buffer):
check_status(CCudaBuffer.FromBuffer(buf_, &cbuf))
return pyarrow_wrap_cudabuffer(cbuf)

@staticmethod
def from_numba(mem):
"""Create a CudaBuffer view from numba MemoryPointer instance.

Parameters
----------
mem : numba.cuda.cudadrv.driver.MemoryPointer

Returns
-------
cbuf : CudaBuffer
Device buffer as a view of numba MemoryPointer.
"""
ctx = Context.from_numba(mem.context)
return ctx.foreign_buffer(mem.device_pointer.value, mem.size)

def to_numba(self):
"""Return numba memory pointer of CudaBuffer instance.
"""
import ctypes
from numba.cuda.cudadrv.driver import MemoryPointer
return MemoryPointer(self.context.to_numba(),
pointer=ctypes.c_void_p(self.address),
size=self.size)
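
A round-trip sketch for the two helpers above (illustrative only; assumes a CUDA device and that numba's DeviceNDArray exposes its backing MemoryPointer as gpu_data):

    import numpy as np
    import numba.cuda
    from pyarrow import cuda

    arr = np.arange(10, dtype=np.uint8)
    darr = numba.cuda.to_device(arr)                  # numba-owned device array
    cbuf = cuda.CudaBuffer.from_numba(darr.gpu_data)  # zero-copy view
    np.testing.assert_equal(
        np.frombuffer(cbuf.copy_to_host(), dtype=np.uint8), arr)
    mem = cbuf.to_numba()                             # back to a numba MemoryPointer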

cdef getitem(self, int64_t i):
return self.copy_to_host(position=i, nbytes=1)[0]

7 changes: 7 additions & 0 deletions python/pyarrow/includes/libarrow_cuda.pxd
@@ -27,6 +27,9 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil:
CStatus GetContext(int gpu_number, shared_ptr[CCudaContext]* ctx)
# CStatus CreateNewContext(int gpu_number,
# shared_ptr[CCudaContext]* ctx)
CStatus CreateSharedContext(int gpu_number,
void* handle,
shared_ptr[CCudaContext]* ctx)
CStatus AllocateHost(int64_t nbytes,
shared_ptr[CCudaHostBuffer]* buffer)
# CStatus FreeHost(void* data, int64_t nbytes)
@@ -36,9 +39,13 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil:
shared_ptr[CCudaContext] shared_from_this()
# CStatus Close()
CStatus Allocate(int64_t nbytes, shared_ptr[CCudaBuffer]* out)
CStatus View(uint8_t* data,
int64_t nbytes,
shared_ptr[CCudaBuffer]* out)
CStatus OpenIpcBuffer(const CCudaIpcMemHandle& ipc_handle,
shared_ptr[CCudaBuffer]* buffer)
int64_t bytes_allocated() const
const void* handle() const

cdef cppclass CCudaIpcMemHandle" arrow::gpu::CudaIpcMemHandle":
@staticmethod
2 changes: 1 addition & 1 deletion python/pyarrow/tests/test_cuda.py
@@ -203,7 +203,7 @@ def test_context_device_buffer():
np.testing.assert_equal(arr[soffset:soffset+ssize], arr2)

cudabuf = global_context.buffer_from_data(
buf.slice(offset=soffset, length=ssize))
buf.slice(offset=soffset, length=ssize))
assert cudabuf.size == ssize
arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8)
np.testing.assert_equal(arr[soffset:soffset+ssize], arr2)