
Commit 9b2df5e

Update based on latest feedback
1 parent 4b1de24 commit 9b2df5e

File tree

1 file changed (+77 -43 lines)

include/dlpack/dlpack.h

Lines changed: 77 additions & 43 deletions
@@ -397,8 +397,8 @@ typedef int (*DLPackManagedTensorAllocator)( //
  * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
  * to a DLManagedTensorVersioned without going through the Python interpreter.
  *
- * It also provides an option to query the current context stream of the device provided
- * by the tensor.
+ * This function does not perform any stream synchronization. The consumer should query
+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
  *
  * This function is exposed by the framework through the DLPackExchangeAPI.
  *
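For illustration, a minimal consumer-side sketch of this no-sync contract (not part of the commit). It uses only names introduced by this diff; launch_kernel_on_stream is a hypothetical consumer function, and obtaining the DLPackExchangeAPI pointer is framework-specific (see the struct documentation further down).

    #include <dlpack/dlpack.h>

    /* Hypothetical consumer kernel launcher; not part of DLPack. */
    extern void launch_kernel_on_stream(const DLTensor* tensor, void* stream);

    static int consume_no_sync(void* py_object, const struct DLPackExchangeAPI* api) {
      DLManagedTensorVersioned* tensor = NULL;
      /* Fast conversion; no stream synchronization happens here. */
      if (api->managed_tensor_from_py_object_no_sync(py_object, &tensor) != 0) {
        return -1;  /* Python error is already set by the producer. */
      }
      void* stream = NULL;
      if (api->current_work_stream != NULL) {
        /* Query the producer's current work stream for this device. */
        api->current_work_stream(tensor->dl_tensor.device.device_type,
                                 tensor->dl_tensor.device.device_id, &stream);
      }
      /* Launch on the producer's work stream so no synchronization is needed. */
      launch_kernel_on_stream(&tensor->dl_tensor, stream);
      if (tensor->deleter != NULL) tensor->deleter(tensor);  /* release our reference */
      return 0;
    }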
@@ -410,51 +410,83 @@ typedef int (*DLPackManagedTensorAllocator)( //
  * We use void* to avoid dependency on Python.h.
  *
  * \param out The output DLManagedTensorVersioned.
+ * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
+ * \note We use void* to avoid dependency on Python.h, so this specific type is
+ *       not dependent on Python.h and can be copied to dlpack.h.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
+    void* py_object, //
+    DLManagedTensorVersioned** out //
+);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a DLTensor whose space is pre-allocated on stack.
+ *
+ * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
+ * to a DLTensor whose space is pre-allocated on stack without going through the Python interpreter.
+ *
+ * This is a non-owning conversion; the producer still owns the memory of data, strides and shape.
+ * The liveness of the DLTensor is only guaranteed until the consumer returns control to the caller.
+ *
+ * In the context of this function, we expect the producer to allocate the space for data, strides and shape.
+ *
+ * This function does not perform any stream synchronization. The consumer should query
+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
+ *
+ * This function is useful when the consumer does not need to retain the tensor memory.
+ * It can generally provide about 2x faster conversion than DLPackManagedTensorFromPyObjectNoSync.
  *
- * \param optional_out_last_active_stream Outputs the current stream the tensor is synced to.
- *        It can be NULL, in which case the stream will not be queried.
- *        optional_out_last_active_stream should point to cudaStream_t in the case of CUDA.
- *        Note that for frameworks that use a stream context manager, optional_out_last_active_stream
- *        can be the stream that the context manager was most recently active on.
- *        The stream is owned by the producer, and the consumer cannot retain it.
- *        Instead, the consumer can record an event or add wait dependencies to it.
- *        It is the responsibility of the consumer to synchronize with the stream if necessary.
- *        The producer may output `reinterpret_cast<void*>(-1)` to indicate that the last active stream
- *        is not available; in such a case, a device sync is needed to ensure data is ready.
+ * For cases where the consumer may need to reorganize the tensor memory via a temporary managed copy,
+ * DLPackManagedTensorFromPyObjectNoSync should be used.
  *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * This information can then be picked up by importers and libraries to perform a fast conversion.
+ * This function should not throw any exceptions; if it fails, it should return -1 and
+ * set the error message via PyErr_SetXXX.
+ *
+ * \param py_object The Python object to convert; this should be PyObject*.
+ *        We use void* to avoid dependency on Python.h.
+ *
+ * \param out The output DLTensor, whose space is pre-allocated on stack.
  * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
  * \note We use void* to avoid dependency on Python.h, so this specific type is
  *       not dependent on Python.h and can be copied to dlpack.h.
  *
  * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
  */
-typedef int (*DLPackManagedTensorFromPyObject)( //
-    void* py_object, //
-    DLManagedTensorVersioned** out, //
-    void** optional_out_last_active_stream //
+typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
+    void* py_object, //
+    DLTensor* out //
 );
 
 /*!
  * \brief Obtain the current work stream of a device.
  *
- * This function is a C-style function pointer to obtain the current work stream of a device
- * for frameworks that rely on a context manager to manage the stream.
+ * This function is a C-style function pointer to obtain the current work stream
+ * of a device for frameworks that rely on a context manager to manage the stream.
  * For example, it should map to torch.cuda.current_stream in PyTorch.
  *
- * This function can be set to NULL if the framework does not rely on a context manager to
- * manage the stream.
+ * This function can be set to NULL if the framework does not rely on a context manager
+ * to manage the stream. However, we encourage frameworks to provide this function
+ * if possible.
+ *
+ * If this field is not set, the consumer likely cannot safely perform stream-based
+ * exchange, and a device synchronization may be needed instead.
  *
  * \param device_type The device type.
  * \param device_id The device id.
- * \param optional_out_current_stream The output current work stream.
+ * \param out_current_stream The output current work stream.
  * \return 0 on success, -1 on failure.
  *
  * \sa DLPackExchangeAPI
  */
 typedef int (*DLPackCurrentWorkStream)( //
-    DLDevice device_type, //
-    DLDevice device_id, //
-    void** optional_out_current_stream //
+    DLDeviceType device_type, //
+    int32_t device_id, //
+    void** out_current_stream //
 );
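A sketch of the stack-allocated fast path above (not part of the commit): the DLTensor lives on the consumer's stack, nothing is owned, and there is no deleter to call; run_reduction is a hypothetical kernel dispatch.

    #include <dlpack/dlpack.h>

    /* Hypothetical consumer-side kernel dispatch; not part of DLPack. */
    extern void run_reduction(const DLTensor* tensor, void* stream);

    static int consume_borrowed(void* py_object, const struct DLPackExchangeAPI* api) {
      DLTensor view;  /* space pre-allocated on the consumer's stack */
      if (api->dltensor_from_py_object_no_sync(py_object, &view) != 0) {
        return -1;  /* Python error set by the producer */
      }
      void* stream = NULL;
      if (api->current_work_stream != NULL) {
        api->current_work_stream(view.device.device_type, view.device.device_id, &stream);
      }
      /* data/shape/strides stay owned by the producer; use them only before
       * returning control to the caller, and launch on the producer's stream. */
      run_reduction(&view, stream);
      return 0;  /* nothing to delete: the conversion was non-owning */
    }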

 /*!
@@ -463,6 +495,8 @@ typedef int (*DLPackCurrentWorkStream)( //
  * This function is a C-style function pointer to quickly convert a DLManagedTensorVersioned
  * to a PyObject* without going through the Python Interpreter.
  *
+ * This function does not perform any stream synchronization.
+ *
  * This function is exposed by the framework through the DLPackExchangeAPI.
  *
  * \param tensor The DLManagedTensorVersioned to convert.
@@ -473,8 +507,8 @@ typedef int (*DLPackCurrentWorkStream)( //
  *
  * \sa DLPackExchangeAPI
  */
-typedef int (*DLPackManagedTensorToPyObject)( //
-    DLManagedTensorVersioned* tensor, void** out_py_object //
+typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
+    DLManagedTensorVersioned* tensor, void** out_py_object //
 );
 
 /*!
@@ -486,22 +520,16 @@ typedef int (*DLPackManagedTensorToPyObject)( //
  * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
  *   with the data from x, y, z. The consumer is also expected to run the kernel with the same
  *   stream context as the producer. For example, when x, y, z is torch.Tensor,
- *   consumer should query exchange_api->optional_current_work_stream to get the
+ *   consumer should query exchange_api->current_work_stream to get the
  *   current stream and launch the kernel with the same stream.
  *   This setup is necessary for no synchronization in kernel launch and maximum compatibility
  *   with CUDA graph capture in the producer.
  *   This is the desirable behavior for library extension support for frameworks like PyTorch.
- * - N1: data ingestion and retention, in such a case, the consumer is interested in obtaining
- *   the data from the producer and runs further computation on its own stream.
- *   In such a case, the consumer can directly query optional_last_active_stream to
- *   get the last active stream and record a dependency.
+ * - N1: data ingestion and retention
  *
- * Consumer should consider their needs (N0 or N1) and act accordingly based on the
- *   availability of the function pointer.
- *
- * Importantly, optional_current_work_stream may be NULL for frameworks that
- *   do not rely on a context manager to manage the stream, in which case the consumer
- *   should rely on the information in optional_last_active_stream.
+ * Note that the obj.__dlpack__() API should provide useful ways to support N1.
+ * The primary focus of the current DLPackExchangeAPI is to enable faster exchange for N0,
+ * with the support of the function pointer current_work_stream.
  *
  * Array/Tensor libraries should statically create and initialize this structure
  * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
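A sketch of how a consumer might recover that pointer from the int value (not part of the commit). The attribute name "__dlpack_exchange_api__" is hypothetical; the diff only says the pointer is exposed as an int. PyObject_GetAttrString and PyLong_AsVoidPtr are standard CPython C API.

    #include <Python.h>
    #include <dlpack/dlpack.h>

    static const struct DLPackExchangeAPI* get_exchange_api(PyObject* obj) {
      /* Hypothetical attribute name; the commit only specifies that the
       * pointer is exposed as a Python int. */
      PyObject* attr = PyObject_GetAttrString(obj, "__dlpack_exchange_api__");
      if (attr == NULL) return NULL;        /* Python error already set */
      void* ptr = PyLong_AsVoidPtr(attr);   /* int value -> C pointer */
      Py_DECREF(attr);
      if (ptr == NULL && PyErr_Occurred()) return NULL;
      return (const struct DLPackExchangeAPI*)ptr;
    }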
@@ -518,9 +546,10 @@ typedef int (*DLPackManagedTensorToPyObject)( //
  *    version.major = DLPACK_MAJOR_VERSION;
  *    version.minor = DLPACK_MINOR_VERSION;
  *    managed_tensor_allocator = MyDLPackManagedTensorAllocator;
- *    managed_tensor_from_py_object = MyDLPackManagedTensorFromPyObject;
- *    managed_tensor_to_py_object = MyDLPackManagedTensorToPyObject
- *    optional_current_work_stream = MyDLPackCurrentWorkStream;
+ *    managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync;
+ *    managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync;
+ *    dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync;
+ *    current_work_stream = MyDLPackCurrentWorkStream;
 *    prev_version_api = nullptr;
  * }
  *
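In plain C (the snippet above is C++-flavored, using nullptr), the same static initialization could look like the following sketch, assuming the MyDLPack* callbacks are the framework's implementations declared elsewhere:

    #include <dlpack/dlpack.h>

    /* MyDLPack* are framework-provided implementations of the corresponding
     * typedefs, declared elsewhere; this is a sketch, not part of the commit. */
    static struct DLPackExchangeAPI my_exchange_api = {
        .version = {DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION},
        .managed_tensor_allocator = MyDLPackManagedTensorAllocator,
        .managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync,
        .managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync,
        .dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync,
        .current_work_stream = MyDLPackCurrentWorkStream,
        .prev_version_api = NULL,
    };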
@@ -564,20 +593,25 @@ struct DLPackExchangeAPI {
   * \brief Framework-specific function pointer for DLPackManagedTensorFromPyObject
   * \sa DLPackManagedTensorFromPyObject
   */
-  DLPackManagedTensorFromPyObject managed_tensor_from_py_object;
+  DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
  /*!
   * \brief Framework-specific function pointer for DLPackManagedTensorToPyObject
   * \sa DLPackManagedTensorToPyObject
   */
-  DLPackManagedTensorToPyObject managed_tensor_to_py_object;
+  DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+  /*!
+   * \brief Framework-specific function pointer for DLPackDLTensorFromPyObjectNoSync
+   * \sa DLPackDLTensorFromPyObjectNoSync
+   */
+  DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
  /*!
   * \brief Framework-specific function pointer for DLPackCurrentWorkStream
   *
   * This function can be set to NULL if the framework does not rely on context manager to manage the stream.
   *
   * \sa DLPackCurrentWorkStream
   */
-  DLPackCurrentWorkStream optional_current_work_stream;
+  DLPackCurrentWorkStream current_work_stream;
 };
 
 #ifdef __cplusplus
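To close the loop, a sketch of handing a result back to Python with the renamed no-sync export (not part of the commit). Whether the call consumes ownership of the tensor on success is an assumption here; the excerpt does not spell it out.

    #include <dlpack/dlpack.h>

    /* Returns a new PyObject* (as void*) on success, NULL on failure. */
    static void* export_result(DLManagedTensorVersioned* tensor,
                               const struct DLPackExchangeAPI* api) {
      void* py_object = NULL;
      if (api->managed_tensor_to_py_object_no_sync(tensor, &py_object) != 0) {
        return NULL;  /* Python error set by the framework */
      }
      /* Assumption: on success the framework takes ownership of `tensor`
       * (it will invoke the deleter), so the caller must not free it. */
      return py_object;
    }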
