Skip to content

Implement stream priority feature #321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions doc/driver.rst
Original file line number Diff line number Diff line change
@@ -331,7 +331,7 @@ Constants
CUDA 6.0 and above.
.. versionadded:: 2014.1
.. attribute :: HOST_NATIVE_ATOMIC_SUPPORTED
SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
PAGEABLE_MEMORY_ACCESS
@@ -644,6 +644,10 @@ Devices and Contexts

See also :mod:`pycuda.autoinit`.

.. function:: get_stream_priority_range()

Returns a tuple ``(least, greatest)`` of numerical values corresponding to the least and greatest stream priorities available in the current context. Note that numerically lower values denote greater priority, so ``least`` may be larger than ``greatest``.

.. class:: Device(number)
Device(pci_bus_id)

@@ -813,7 +817,7 @@ Devices and Contexts
Concurrency and Streams
-----------------------

.. class:: Stream(flags=0)
.. class:: Stream(flags=0, priority=0)

A handle for a queue of operations that will be carried out in order.

26 changes: 23 additions & 3 deletions src/cpp/cuda.hpp
Original file line number Diff line number Diff line change
@@ -531,7 +531,6 @@ namespace pycuda
* to push contexts that are already active at a deeper stack level, so we
* maintain all contexts floating other than the top one.
*/

// for friend decl
namespace gl {
boost::shared_ptr<context>
@@ -862,6 +861,18 @@ namespace pycuda
return result;
}

#if CUDAPP_CUDA_VERSION >= 7500
  // Query the numeric bounds of stream priorities for the current context
  // and return them to Python as a (least, greatest) tuple.
  // Only available from CUDA 7.5 on, where cuCtxGetStreamPriorityRange exists.
  inline
  py::tuple get_stream_priority_range()
  {
    int least_priority;
    int greatest_priority;
    CUDAPP_CALL_GUARDED(cuCtxGetStreamPriorityRange,
        (&least_priority, &greatest_priority));
    return py::make_tuple(least_priority, greatest_priority);
  }
#endif



#if CUDAPP_CUDA_VERSION >= 7000
inline boost::shared_ptr<context> device::retain_primary_context()
@@ -997,8 +1008,17 @@ namespace pycuda
CUstream m_stream;

public:
stream(unsigned int flags=0)
{ CUDAPP_CALL_GUARDED(cuStreamCreate, (&m_stream, flags)); }

#if CUDAPP_CUDA_VERSION >= 7500
stream(unsigned int flags=0, int priority=0)
{ CUDAPP_CALL_GUARDED(cuStreamCreateWithPriority, (&m_stream, flags, priority)); }
#else
if (priority != 0)
throw pycuda::error("stream", CUDA_ERROR_INVALID_HANDLE,
"priority!=0 setting isn't supported for your CUDA version");
stream(unsigned int flags=0)
{ CUDAPP_CALL_GUARDED(cuStreamCreate, (&m_stream, flags)); }
#endif

~stream()
{
5 changes: 4 additions & 1 deletion src/wrapper/wrap_cudadrv.cpp
Original file line number Diff line number Diff line change
@@ -1193,13 +1193,16 @@ BOOST_PYTHON_MODULE(_driver)
.add_property("handle", &cl::handle_int)
;
}

DEF_SIMPLE_FUNCTION(get_stream_priority_range);

// }}}

// {{{ stream
{
typedef stream cl;
py::class_<cl, boost::noncopyable, shared_ptr<cl> >
("Stream", py::init<unsigned int>(py::arg("flags")=0))
("Stream", py::init<unsigned int, int>(py::arg("flags")=0, py::arg("priority")=0))
.DEF_SIMPLE_METHOD(synchronize)
.DEF_SIMPLE_METHOD(is_done)
#if CUDAPP_CUDA_VERSION >= 3020
23 changes: 23 additions & 0 deletions test/test_driver.py
Original file line number Diff line number Diff line change
@@ -935,6 +935,29 @@ def test_register_host_memory(self):
drv.memcpy_htod_async(gpu_ary, a_pin, stream)
drv.Context.synchronize()

@mark_cuda_test
def test_stream_priority_setting(self):
    # Exercise Stream(priority=...) together with get_stream_priority_range()
    # by issuing a prioritized async host-to-device copy.
    # Stream priorities (cuStreamCreateWithPriority /
    # cuCtxGetStreamPriorityRange) only exist on CUDA 7.5 and later.
    if drv.get_version() < (7, 5):
        from py.test import skip

        skip("stream priorities only exist on CUDA 7.5 and later")

    import sys

    # The test pins host memory via register_host_memory, which is not
    # supported on OS X.
    if sys.platform == "darwin":
        from py.test import skip

        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2 ** 20,), np.float64)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    least, greatest = drv.get_stream_priority_range()
    # Numerically lower values mean higher priority, so the returned pair
    # is typically descending (e.g. (0, -2)); normalize before sampling so
    # the range is never empty.
    lo, hi = min(least, greatest), max(least, greatest)
    stream = drv.Stream(priority=int(np.random.randint(lo, hi + 1)))
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()

@mark_cuda_test
# https://github.com/inducer/pycuda/issues/45
def test_recursive_launch(self):