@@ -397,8 +397,8 @@ typedef int (*DLPackManagedTensorAllocator)( //
397397 * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
398398 * to a DLManagedTensorVersioned without going through the Python interpreter.
399399 *
400- * It also provides an option to query the current context stream of the device provided
401- * by the tensor.
400+ * This function does not perform any stream synchronization. The consumer should query
401+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
402402 *
403403 * This function is exposed by the framework through the DLPackExchangeAPI.
404404 *
@@ -410,51 +410,83 @@ typedef int (*DLPackManagedTensorAllocator)( //
410410 * We use void* to avoid dependency on Python.h.
411411 *
412412 * \param out The output DLManagedTensorVersioned.
413+ * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
414+ * \note We use void* to avoid dependency on Python.h, so this specific type is
415+ * not dependent on Python.h and can be copied to dlpack.h.
416+ *
417+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
418+ */
419+ typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
420+ void * py_object, //
421+ DLManagedTensorVersioned** out //
422+ );
423+
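A minimal consumer-side sketch of how this entry point could be called, assuming the declarations from this header are in scope; the names `api` (a pointer to the producer's DLPackExchangeAPI, obtained in a framework-specific way) and `py_tensor` are illustrative assumptions, not part of the header.

/* Sketch only: convert a framework tensor (passed as void* to avoid Python.h)
 * into an owning DLManagedTensorVersioned without any stream synchronization. */
int import_tensor(void* py_tensor, const struct DLPackExchangeAPI* api,
                  DLManagedTensorVersioned** out) {
  if (api->managed_tensor_from_py_object_no_sync(py_tensor, out) != 0) {
    return -1;  /* a Python error has already been set by the producer */
  }
  /* Kernels should be launched on the producer's current work stream,
   * queried via api->current_work_stream (see DLPackCurrentWorkStream below). */
  return 0;
}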
424+ /*!
425+ * \brief Exports a PyObject* Tensor/NDArray to a DLTensor whose space is pre-allocated on stack.
426+ *
427+ * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
428+ * to a DLTensor whose space is pre-allocated on stack without going through the Python interpreter.
429+ *
430+ * This is a non-owning conversion; the producer still owns the memory of data, strides, and shape.
431+ * The liveness of the DLTensor is only guaranteed until the consumer returns control to the caller.
432+ *
433+ * In the context of this function, we expect the producer to allocate space for data, strides, and shape.
434+ *
435+ * This function does not perform any stream synchronization. The consumer should query
436+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
437+ *
438+ * This function is useful when the consumer does not need to retain the tensor memory.
439+ * It can generally provide about 2x faster conversion than DLPackManagedTensorFromPyObjectNoSync.
413440 *
414- * \param optional_out_last_active_stream Outputs the current stream the tensor is synced to.
415- * It can be NULL, in which case the stream will not be queried.
416- * optional_out_last_active_stream should point to cudaStream_t in the case of CUDA.
417- * Note that for frameworks that use a stream context manager, optional_out_last_active_stream
418- * can be the stream that the context manager was most recently active on.
419- * The stream is owned by the producer, and the consumer cannot retain it.
420- * Instead, the consumer can record an event or add wait dependencies to it.
421- * It is the responsibility of the consumer to synchronize with the stream if necessary.
422- * The producer may output `reinterpret_cast<void*>(-1)` to indicate that the last active stream
423- * is not available; in such a case, a device sync is needed to ensure data is ready.
441+ * For cases where the consumer may need to reorganize the tensor memory via a temporary managed copy,
442+ * DLPackManagedTensorFromPyObjectNoSync should be used instead.
424443 *
444+ * This function is exposed by the framework through the DLPackExchangeAPI.
445+ *
446+ * This information can then be picked up by importers and libraries to perform a fast conversion.
447+ * This function should not throw any exceptions; if it fails, it should return -1 and
448+ * set the error message via PyErr_SetXXX.
449+ *
450+ * \param py_object The Python object to convert; this should be PyObject*.
451+ * We use void* to avoid dependency on Python.h.
452+ *
453+ * \param out The output DLTensor, whose space is pre-allocated on stack.
425454 * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
426455 * \note We use void* to avoid dependency on Python.h, so this specific type is
427456 * not dependent on Python.h and can be copied to dlpack.h.
428457 *
429458 * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
430459 */
431- typedef int (*DLPackManagedTensorFromPyObject)( //
432- void * py_object, //
433- DLManagedTensorVersioned** out, //
434- void ** optional_out_last_active_stream //
460+ typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
461+ void * py_object, //
462+ DLTensor* out //
435463);
436464
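A sketch of the non-owning fast path, under the same illustrative assumptions as above (`api` and `py_tensor` are placeholder names). The DLTensor lives on the consumer's stack and must not be retained after control returns.

int consume_view(void* py_tensor, const struct DLPackExchangeAPI* api) {
  DLTensor view;  /* space pre-allocated on the consumer's stack */
  if (api->dltensor_from_py_object_no_sync(py_tensor, &view) != 0) {
    return -1;  /* a Python error has already been set by the producer */
  }
  /* view.data, view.shape, and view.strides remain owned by the producer;
   * use them here and do not keep the pointers after returning. */
  return 0;
}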
437465/*!
438466 * \brief Obtain the current work stream of a device.
439467 *
440- * This function is a C-style function pointer to obtain the current work stream of a device
441- * for frameworks that rely on a context manager to manage the stream.
468+ * This function is a C-style function pointer to obtain the current work stream
469+ * of a device for frameworks that rely on a context manager to manage the stream.
442470 * For example, it should map to torch.cuda.current_stream in PyTorch.
443471 *
444- * This function can be set to NULL if the framework does not rely on a context manager to
445- * manage the stream.
472+ * This function can be set to NULL if the framework does not rely on a context manager
473+ * to manage the stream. However, we encourage frameworks to provide this function
474+ * if possible.
475+ *
476+ * If this field is not set, the consumer likely cannot safely perform stream-based
477+ * exchange, since it has no way to query the producer's current work stream.
446478 *
447479 * \param device_type The device type.
448480 * \param device_id The device id.
449- * \param optional_out_current_stream The output current work stream.
481+ * \param out_current_stream The output current work stream.
450482 * \return 0 on success, -1 on failure.
451483 *
452484 * \sa DLPackExchangeAPI
453485 */
454486typedef int (*DLPackCurrentWorkStream)( //
455- DLDevice device_type, //
456- DLDevice device_id, //
457- void ** optional_out_current_stream //
487+ DLDeviceType device_type, //
488+ int32_t device_id, //
489+ void ** out_current_stream //
458490);
459491
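A sketch of how a consumer might query the producer's work stream for the device of a DLTensor before launching a kernel; `api` is the illustrative exchange-API pointer from the earlier sketches, and on CUDA the returned handle would be a cudaStream_t owned by the producer.

int query_stream(const struct DLPackExchangeAPI* api, const DLTensor* view,
                 void** out_stream) {
  *out_stream = NULL;  /* e.g. cudaStream_t on CUDA, owned by the producer */
  if (api->current_work_stream == NULL) {
    return 0;  /* no stream context manager; consumer may fall back to device sync */
  }
  return api->current_work_stream(view->device.device_type,
                                  view->device.device_id, out_stream);
}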
460492/*!
@@ -463,6 +495,8 @@ typedef int (*DLPackCurrentWorkStream)( //
463495 * This function is a C-style function pointer to quickly convert a DLManagedTensorVersioned
464496 * to a PyObject* without going through the Python Interpreter.
465497 *
498+ * This function does not perform any stream synchronization.
499+ *
466500 * This function is exposed by the framework through the DLPackExchangeAPI.
467501 *
468502 * \param tensor The DLManagedTensorVersioned to convert.
@@ -473,8 +507,8 @@ typedef int (*DLPackCurrentWorkStream)( //
473507 *
474508 * \sa DLPackExchangeAPI
475509 */
476- typedef int (*DLPackManagedTensorToPyObject)( //
477- DLManagedTensorVersioned* tensor, void ** out_py_object //
510+ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
511+ DLManagedTensorVersioned* tensor, void ** out_py_object //
478512);
479513
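A sketch of handing a result back to the producer framework under the same assumptions; `api` is the illustrative exchange-API pointer, and the returned void* would be cast to PyObject* only by callers that include Python.h.

/* Sketch: wrap a consumer-produced DLManagedTensorVersioned into the framework's
 * Python tensor type, without any stream synchronization. */
void* export_tensor(const struct DLPackExchangeAPI* api,
                    DLManagedTensorVersioned* tensor) {
  void* py_object = NULL;
  if (api->managed_tensor_to_py_object_no_sync(tensor, &py_object) != 0) {
    return NULL;  /* a Python error has already been set by the framework */
  }
  return py_object;  /* cast to PyObject* where Python.h is available */
}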
480514/*!
@@ -486,22 +520,16 @@ typedef int (*DLPackManagedTensorToPyObject)( //
486520 * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
487521 * with the data from x, y, z. The consumer is also expected to run the kernel with the same
488522 * stream context as the producer. For example, when x, y, z is torch.Tensor,
489- * consumer should query exchange_api->optional_current_work_stream to get the
523+ * consumer should query exchange_api->current_work_stream to get the
490524 * current stream and launch the kernel with the same stream.
491525 * This setup is necessary for no synchronization in kernel launch and maximum compatibility
492526 * with CUDA graph capture in the producer.
493527 * This is the desirable behavior for library extension support for frameworks like PyTorch.
494- * - N1: data ingestion and retention, in such a case, the consumer is interested in obtaining
495- * the data from the producer and runs further computation on its own stream.
496- * In such a case, the consumer can directly query optional_last_active_stream to
497- * get the last active stream and record a dependency.
528+ * - N1: data ingestion and retention, where the consumer keeps the data and runs further computation on its own stream.
498529 *
499- * Consumer should consider their needs (N0 or N1) and act accordingly based on the
500- * availability of the function pointer.
501- *
502- * Importantly, optional_current_work_stream may be NULL for frameworks that
503- * do not rely on a context manager to manage the stream, in which case the consumer
504- * should rely on the information in optional_last_active_stream.
530+ * Note that the obj.__dlpack__() API should already provide a suitable path for N1.
531+ * The primary focus of the current DLPackExchangeAPI is to enable faster exchange for N0,
532+ * with the support of the function pointer current_work_stream.
505533 *
506534 * Array/Tensor libraries should statically create and initialize this structure
507535 * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
@@ -518,9 +546,10 @@ typedef int (*DLPackManagedTensorToPyObject)( //
518546 * version.major = DLPACK_MAJOR_VERSION;
519547 * version.minor = DLPACK_MINOR_VERSION;
520548 * managed_tensor_allocator = MyDLPackManagedTensorAllocator;
521- * managed_tensor_from_py_object = MyDLPackManagedTensorFromPyObject;
522- * managed_tensor_to_py_object = MyDLPackManagedTensorToPyObject
523- * optional_current_work_stream = MyDLPackCurrentWorkStream;
549+ * managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync;
550+ * managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync;
551+ * dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync;
552+ * current_work_stream = MyDLPackCurrentWorkStream;
524553 * prev_version_api = nullptr;
525554 * }
526555 *
@@ -564,20 +593,25 @@ struct DLPackExchangeAPI {
564593 * \brief Framework-specific function pointer for DLPackManagedTensorFromPyObject
565594 * \sa DLPackManagedTensorFromPyObject
566595 */
567- DLPackManagedTensorFromPyObject managed_tensor_from_py_object;
596+ DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
568597 /*!
569598 * \brief Framework-specific function pointer for DLPackManagedTensorToPyObject
570599 * \sa DLPackManagedTensorToPyObject
571600 */
572- DLPackManagedTensorToPyObject managed_tensor_to_py_object;
601+ DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
602+ /*!
603+ * \brief Framework-specific function pointer for DLPackDLTensorFromPyObjectNoSync
604+ * \sa DLPackDLTensorFromPyObjectNoSync
605+ */
606+ DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
573607 /*!
574608 * \brief Framework-specific function pointer for DLPackCurrentWorkStream
575609 *
576610 * This function can be set to NULL if the framework does not rely on context manager to manage the stream.
577611 *
578612 * \sa DLPackCurrentWorkStream
579613 */
580- DLPackCurrentWorkStream optional_current_work_stream;
614+ DLPackCurrentWorkStream current_work_stream;
581615};
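Putting the pieces together, a hypothetical N0-style library entry point might look as follows. How the DLPackExchangeAPI pointer is obtained from the Python tensor object is framework-specific and not defined in this header, so `get_exchange_api` and `my_kernel_launch` are placeholders for illustration only.

/* Placeholders, framework/library specific; not part of this header. */
extern const struct DLPackExchangeAPI* get_exchange_api(void* py_obj);
extern void my_kernel_launch(const DLTensor* x, void* stream);

int consumer_kernel(void* py_x) {
  const struct DLPackExchangeAPI* api = get_exchange_api(py_x);
  DLTensor x;
  void* stream = NULL;
  if (api->dltensor_from_py_object_no_sync(py_x, &x) != 0) {
    return -1;  /* Python error already set by the producer */
  }
  if (api->current_work_stream != NULL &&
      api->current_work_stream(x.device.device_type, x.device.device_id,
                               &stream) != 0) {
    return -1;
  }
  /* Launch on the producer's current stream so no synchronization is needed
   * and CUDA graph capture in the producer keeps working. */
  my_kernel_launch(&x, stream);
  return 0;
}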
582616
583617#ifdef __cplusplus