Add python wrapper for system memory resource #1605

Merged Jul 25, 2024 · 16 commits (diff below shows changes from 15 commits)
rmm/mr/device/sam_headroom_memory_resource.hpp
@@ -23,12 +23,12 @@

namespace rmm::mr {
/**
* @addtogroup device_resource_adaptors
* @addtogroup device_memory_resources
* @{
* @file
*/
/**
* @brief Resource that adapts system memory resource to allocate memory with a headroom.
* @brief Resource that uses system memory resource to allocate memory with a headroom.
*
* System allocated memory (SAM) can be migrated to the GPU, but is never migrated back to the host. If
* GPU memory is over-subscribed, this can cause other CUDA calls to fail with out-of-memory errors.
@@ -39,46 +39,22 @@ namespace rmm::mr {
* Since doing this check on every allocation can be expensive, the caller may choose to use other
* allocators (e.g. `binning_memory_resource`) for small allocations, and use this allocator for
* large allocations only.
*
* @tparam Upstream Type of the upstream resource used for allocation/deallocation. Must be
* `system_memory_resource`.
*/
template <typename Upstream>
class sam_headroom_resource_adaptor final : public device_memory_resource {
class sam_headroom_memory_resource final : public device_memory_resource {
public:
/**
* @brief Construct a headroom adaptor using `upstream` to satisfy allocation requests.
* @brief Construct a headroom memory resource.
*
* @param upstream The resource used for allocating/deallocating device memory. Must be
* `system_memory_resource`.
* @param headroom Size of the reserved GPU memory as headroom
*/
explicit sam_headroom_resource_adaptor(Upstream* upstream, std::size_t headroom)
: upstream_{upstream}, headroom_{headroom}
{
static_assert(std::is_same_v<system_memory_resource, Upstream>,
"Upstream must be rmm::mr::system_memory_resource");
}
explicit sam_headroom_memory_resource(std::size_t headroom) : system_mr_{}, headroom_{headroom} {}

sam_headroom_resource_adaptor() = delete;
~sam_headroom_resource_adaptor() override = default;
sam_headroom_resource_adaptor(sam_headroom_resource_adaptor const&) = delete;
sam_headroom_resource_adaptor(sam_headroom_resource_adaptor&&) = delete;
sam_headroom_resource_adaptor& operator=(sam_headroom_resource_adaptor const&) = delete;
sam_headroom_resource_adaptor& operator=(sam_headroom_resource_adaptor&&) = delete;

/**
* @briefreturn{rmm::device_async_resource_ref to the upstream resource}
*/
[[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept
{
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }
sam_headroom_memory_resource() = delete;
~sam_headroom_memory_resource() override = default;
sam_headroom_memory_resource(sam_headroom_memory_resource const&) = delete;
sam_headroom_memory_resource(sam_headroom_memory_resource&&) = delete;
sam_headroom_memory_resource& operator=(sam_headroom_memory_resource const&) = delete;
sam_headroom_memory_resource& operator=(sam_headroom_memory_resource&&) = delete;

private:
/**
@@ -94,8 +70,7 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
*/
void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
{
void* pointer =
get_upstream_resource().allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
void* pointer = system_mr_.allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);

auto const free = rmm::available_device_memory().first;
auto const allocatable = free > headroom_ ? free - headroom_ : 0UL;
@@ -131,7 +106,7 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
[[maybe_unused]] std::size_t bytes,
[[maybe_unused]] cuda_stream_view stream) override
{
get_upstream_resource().deallocate_async(ptr, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
system_mr_.deallocate_async(ptr, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
}

/**
@@ -144,13 +119,15 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
[[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
{
if (this == &other) { return true; }
auto cast = dynamic_cast<sam_headroom_resource_adaptor const*>(&other);
auto cast = dynamic_cast<sam_headroom_memory_resource const*>(&other);
if (cast == nullptr) { return false; }
return get_upstream_resource() == cast->get_upstream_resource() && headroom_ == cast->headroom_;
return headroom_ == cast->headroom_;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
std::size_t headroom_; ///< Size of GPU memory reserved as headroom
///< The system memory resource used for satisfying allocation requests
system_memory_resource system_mr_;
///< Size of GPU memory reserved as headroom
std::size_t headroom_;
};
/** @} */ // end of group
} // namespace rmm::mr
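The doc comment above recommends routing small allocations to a cheaper allocator and reserving this resource for large ones, because `do_allocate` queries free device memory (`rmm::available_device_memory()`) on every allocation. A minimal sketch of that pattern through the Python wrapper added in this PR; the headroom value and bin-size exponents are illustrative assumptions, not values taken from this change:

```python
import rmm

# SAM resource that keeps 1 GiB of GPU memory free as headroom
# (illustrative value).
headroom_mr = rmm.mr.SystemMemoryResource(headroom=1 << 30)

# Serve allocations up to 4 MiB from fixed-size bins so the
# free-memory query runs only for large allocations and bin refills.
mr = rmm.mr.BinningMemoryResource(
    headroom_mr,
    min_size_exponent=10,  # smallest bin: 2**10 = 1 KiB
    max_size_exponent=22,  # largest bin:  2**22 = 4 MiB
)
rmm.mr.set_current_device_resource(mr)

# 16 MiB exceeds the largest bin, so it falls through to headroom_mr.
buf = rmm.DeviceBuffer(size=1 << 24)
```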
3 changes: 3 additions & 0 deletions python/rmm/rmm/_lib/memory_resource.pxd
@@ -52,6 +52,9 @@ cdef class CudaMemoryResource(DeviceMemoryResource):
cdef class ManagedMemoryResource(DeviceMemoryResource):
pass

cdef class SystemMemoryResource(DeviceMemoryResource):
pass

cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
pass

42 changes: 42 additions & 0 deletions python/rmm/rmm/_lib/memory_resource.pyx
@@ -94,6 +94,16 @@ cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
cdef cppclass managed_memory_resource(device_memory_resource):
managed_memory_resource() except +

cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass system_memory_resource(device_memory_resource):
system_memory_resource() except +

cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass sam_headroom_memory_resource(device_memory_resource):
sam_headroom_memory_resource(size_t headroom) except +

cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
namespace "rmm::mr" nogil:

@@ -366,6 +376,38 @@ cdef class ManagedMemoryResource(DeviceMemoryResource):
pass


cdef class SystemMemoryResource(DeviceMemoryResource):
def __cinit__(
self,
headroom=None
):
if headroom is None:
self.c_obj.reset(
new system_memory_resource()
)
else:
self.c_obj.reset(
new sam_headroom_memory_resource(
<size_t> headroom
)
)

def __init__(
self,
headroom=None
):
"""
Memory resource that uses ``malloc``/``free`` for
allocation/deallocation.

Parameters
----------
headroom : size_t
Size of the reserved GPU memory as headroom
"""
pass


cdef class PoolMemoryResource(UpstreamResourceAdaptor):

def __cinit__(
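For orientation, a short usage sketch of the new `SystemMemoryResource` wrapper defined above, mirroring what the tests later in this PR exercise (the 1 MiB headroom value is arbitrary):

```python
import rmm

# Without headroom: a plain system_memory_resource is constructed.
mr = rmm.mr.SystemMemoryResource()

# With headroom: a sam_headroom_memory_resource is constructed instead,
# reserving the given amount of GPU memory for other CUDA work.
mr = rmm.mr.SystemMemoryResource(headroom=1 << 20)

rmm.mr.set_current_device_resource(mr)
assert rmm.is_initialized()
```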
2 changes: 2 additions & 0 deletions python/rmm/rmm/mr.py
@@ -25,6 +25,7 @@
PoolMemoryResource,
PrefetchResourceAdaptor,
StatisticsResourceAdaptor,
SystemMemoryResource,
TrackingResourceAdaptor,
UpstreamResourceAdaptor,
_flush_logs,
@@ -55,6 +56,7 @@
"PoolMemoryResource",
"PrefetchResourceAdaptor",
"StatisticsResourceAdaptor",
"SystemMemoryResource",
"TrackingResourceAdaptor",
"FailureCallbackResourceAdaptor",
"UpstreamResourceAdaptor",
57 changes: 55 additions & 2 deletions python/rmm/rmm/tests/test_rmm.py
@@ -38,6 +38,11 @@
_runtime_version >= 11020
)

_SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
rmm._cuda.gpu.getDevice(),
)


def array_tester(dtype, nelem, alloc):
# data
@@ -91,6 +96,38 @@ def test_rmm_modes(dtype, nelem, alloc, managed, pool):
array_tester(dtype, nelem, alloc)


@pytest.mark.skipif(
not _SYSTEM_MEMORY_SUPPORTED,
reason="System memory not supported",
)
@pytest.mark.parametrize("dtype", _dtypes)
@pytest.mark.parametrize("nelem", _nelems)
@pytest.mark.parametrize("alloc", _allocs)
@pytest.mark.parametrize(
"system, pool, headroom",
list(product([False, True], [False, True], [False, True])),
)
def test_rmm_modes_system_memory(dtype, nelem, alloc, system, pool, headroom):
assert rmm.is_initialized()
array_tester(dtype, nelem, alloc)

if system:
base_mr = rmm.mr.SystemMemoryResource(
headroom=1 << 20 if headroom else None
)
else:
base_mr = rmm.mr.CudaMemoryResource()
if pool:
mr = rmm.mr.PoolMemoryResource(base_mr)
else:
mr = base_mr
rmm.mr.set_current_device_resource(mr)

assert rmm.is_initialized()

array_tester(dtype, nelem, alloc)


@pytest.mark.parametrize("dtype", _dtypes)
@pytest.mark.parametrize("nelem", _nelems)
@pytest.mark.parametrize("alloc", _allocs)
@@ -410,7 +447,15 @@ def test_pool_memory_resource(dtype, nelem, alloc):
[
lambda: rmm.mr.CudaMemoryResource(),
lambda: rmm.mr.ManagedMemoryResource(),
],
]
+ (
[
lambda: rmm.mr.SystemMemoryResource(),
lambda: rmm.mr.SystemMemoryResource(headroom=1 << 20),
]
if _SYSTEM_MEMORY_SUPPORTED
else []
),
)
def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
mr = rmm.mr.FixedSizeMemoryResource(
@@ -432,7 +477,15 @@ def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
lambda: rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(), 1 << 20
),
],
]
+ (
[
lambda: rmm.mr.SystemMemoryResource(),
lambda: rmm.mr.SystemMemoryResource(headroom=1 << 20),
]
if _SYSTEM_MEMORY_SUPPORTED
else []
),
)
def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr):
upstream = upstream_mr()
39 changes: 12 additions & 27 deletions tests/mr/device/system_mr_tests.cu
@@ -18,7 +18,7 @@

#include <rmm/cuda_device.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/sam_headroom_resource_adaptor.hpp>
#include <rmm/mr/device/sam_headroom_memory_resource.hpp>
#include <rmm/mr/device/system_memory_resource.hpp>

#include <gtest/gtest.h>
@@ -54,9 +54,9 @@ void touch_on_gpu(void* ptr, std::size_t size)
using system_mr = rmm::mr::system_memory_resource;
static_assert(cuda::mr::resource_with<system_mr, cuda::mr::device_accessible>);
static_assert(cuda::mr::async_resource_with<system_mr, cuda::mr::device_accessible>);
using headroom_adaptor = rmm::mr::sam_headroom_resource_adaptor<rmm::mr::system_memory_resource>;
static_assert(cuda::mr::resource_with<headroom_adaptor, cuda::mr::device_accessible>);
static_assert(cuda::mr::async_resource_with<headroom_adaptor, cuda::mr::device_accessible>);
using headroom_mr = rmm::mr::sam_headroom_memory_resource;
static_assert(cuda::mr::resource_with<headroom_mr, cuda::mr::device_accessible>);
static_assert(cuda::mr::async_resource_with<headroom_mr, cuda::mr::device_accessible>);

class SystemMRTest : public ::testing::Test {
protected:
@@ -79,19 +79,6 @@ TEST(SystemMRSimpleTest, ThrowIfNotSupported)
}
}

TEST(SAMHeadroomAdaptorTest, ThrowIfNotSupported)
{
auto construct_mr = []() {
system_mr mr;
headroom_adaptor adaptor{&mr, 0};
};
if (rmm::mr::detail::is_system_memory_supported(rmm::get_current_cuda_device())) {
EXPECT_NO_THROW(construct_mr());
} else {
EXPECT_THROW(construct_mr(), rmm::logic_error);
}
}

TEST_F(SystemMRTest, FirstTouchOnCPU)
{
auto const free = rmm::available_device_memory().first;
@@ -114,23 +101,21 @@ TEST_F(SystemMRTest, FirstTouchOnGPU)
mr.deallocate(ptr, size_mb);
}

TEST_F(SystemMRTest, AdaptorReserveAllFreeMemory)
TEST_F(SystemMRTest, HeadroomMRReserveAllFreeMemory)
{
auto const free = rmm::available_device_memory().first;
system_mr mr;
// All the free GPU memory is set as headroom, so allocation is only on the CPU.
headroom_adaptor adaptor{&mr, free + size_gb};
void* ptr = adaptor.allocate(size_mb);
headroom_mr mr{free + size_gb};
void* ptr = mr.allocate(size_mb);
touch_on_cpu(ptr, size_mb);
adaptor.deallocate(ptr, size_mb);
mr.deallocate(ptr, size_mb);
}

TEST_F(SystemMRTest, AdaptorDifferentParametersUnequal)
TEST_F(SystemMRTest, HeadroomMRDifferentParametersUnequal)
{
system_mr mr;
headroom_adaptor adaptor1{&mr, size_mb};
headroom_adaptor adaptor2{&mr, size_gb};
EXPECT_FALSE(adaptor1.is_equal(adaptor2));
headroom_mr mr1{size_mb};
headroom_mr mr2{size_gb};
EXPECT_FALSE(mr1.is_equal(mr2));
}
} // namespace
} // namespace rmm::test