Regards #291, #322: Added a unique_span class - a uniquely-owned span. It should provide the most flexibility in our example programs (and typical user programs) for direct use without explicit casting, re-specifying types and sizes, etc.
eyalroz authored and eyalroz-gehc committed Feb 13, 2024
1 parent 7a41a7e commit fe58fa5
Showing 6 changed files with 412 additions and 20 deletions.
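The gist of the change, ahead of the per-file diffs: a unique_span owns its allocation the way a ::std::unique_ptr does, but also carries its element type and count the way a cuda::span does, so sizes no longer travel separately. A minimal before/after sketch, using only calls that appear in the diffs below (device, n and datum stand in for the examples' own variables):

	// Before: an owning pointer; the element count must be passed around separately
	auto d_old = cuda::memory::make_unique<datum[]>(device, n);
	// After: an owning span; pointer, element type and count stay together
	auto d_new = cuda::memory::make_unique_span<datum>(device, n);
	// ... d_new.data() and d_new.size() are then available wherever d_new goes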
23 changes: 11 additions & 12 deletions examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
@@ -29,12 +29,12 @@ __global__ void increment_kernel(datum *g_data, datum inc_value)
 	g_data[global_idx] = g_data[global_idx] + inc_value;
 }
 
-bool correct_output(int *data, const int n, const int x)
+bool correct_output(cuda::span<const int> data, const int x)
 {
-	for (int i = 0; i < n; i++)
+	for (size_t i = 0; i < data.size(); i++)
 		if (data[i] != x)
 		{
-			printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+			printf("Error! data[%lu] = %d, ref = %d\n", i, data[i], x);
 			return false;
 		}
 	return true;
@@ -51,15 +51,14 @@ int main(int, char **)
 
 	std::cout << "CUDA device [" << device.name() << "]\n";
 
-	int n = 16 * 1024 * 1024;
-	int num_bytes = n * sizeof(datum);
+	const int n = 16 * 1024 * 1024;
 	int value = 26;
 
 	// allocate host memory
-	auto a = cuda::memory::host::make_unique<datum[]>(n);
-	cuda::memory::host::zero(a.get(), num_bytes);
+	auto a = cuda::memory::host::make_unique_span<datum>(n);
+	cuda::memory::host::zero(a);
 
-	auto d_a = cuda::memory::make_unique<datum[]>(device, n);
+	auto d_a = cuda::memory::make_unique_span<datum>(device, n);
 
 	auto launch_config = cuda::launch_config_builder()
 		.overall_size(n)
@@ -80,9 +79,9 @@ int main(int, char **)
 	auto stream = device.default_stream(); // device.create_stream(cuda::stream::async);
 	auto cpu_time_start = std::chrono::high_resolution_clock::now();
 	stream.enqueue.event(start_event);
-	stream.enqueue.copy(d_a.get(), a.get(), num_bytes);
-	stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.get(), value);
-	stream.enqueue.copy(a.get(), d_a.get(), num_bytes);
+	stream.enqueue.copy(d_a, a);
+	stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value);
+	stream.enqueue.copy(a, d_a);
 	stream.enqueue.event(end_event);
 	auto cpu_time_end = std::chrono::high_resolution_clock::now();
 
@@ -99,7 +98,7 @@ int main(int, char **)
 	std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2) << (cpu_time_end - cpu_time_start).count() << '\n';
 	std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n";
 
-	auto bFinalResults = correct_output(a.get(), n, value);
+	auto bFinalResults = correct_output(a, value);
 
 	std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n';
 
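Note the correct_output change above: it now takes a cuda::span<const int> and is called with the unique_span directly, dropping the separate length parameter. A hedged sketch of the same pattern (sum is an illustrative placeholder, not part of the commit):

	// A unique_span converts to a span at the call site, carrying its size along
	int sum(cuda::span<const int> data)
	{
		int total = 0;
		for (size_t i = 0; i < data.size(); i++) { total += data[i]; }
		return total;
	}
	// auto a = cuda::memory::host::make_unique_span<int>(n);
	// int s = sum(a);   // no length argument, no a.get()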
15 changes: 7 additions & 8 deletions examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
@@ -30,7 +30,6 @@ int main()
 	}
 
 	int numElements = 50000;
-	size_t size = numElements * sizeof(float);
 	std::cout << "[Vector addition of " << numElements << " elements]\n";
 
 	// If we could rely on C++14, we would use std::make_unique
@@ -43,12 +42,12 @@
 	std::generate(h_B.get(), h_B.get() + numElements, generator);
 
 	auto device = cuda::device::current::get();
-	auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
-	auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
-	auto d_C = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_A = cuda::memory::make_unique_span<float>(device, numElements);
+	auto d_B = cuda::memory::make_unique_span<float>(device, numElements);
+	auto d_C = cuda::memory::make_unique_span<float>(device, numElements);
 
-	cuda::memory::copy(d_A.get(), h_A.get(), size);
-	cuda::memory::copy(d_B.get(), h_B.get(), size);
+	cuda::memory::copy(d_A, h_A.get());
+	cuda::memory::copy(d_B, h_B.get());
 
 	auto launch_config = cuda::launch_config_builder()
 		.overall_size(numElements)
@@ -61,10 +60,10 @@
 
 	cuda::launch(
 		vectorAdd, launch_config,
-		d_A.get(), d_B.get(), d_C.get(), numElements
+		d_A.data(), d_B.data(), d_C.data(), numElements
 	);
 
-	cuda::memory::copy(h_C.get(), d_C.get(), size);
+	cuda::memory::copy(h_C.get(), d_C);
 
 	// Verify that the result vector is correct
 	for (int i = 0; i < numElements; ++i) {
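The copy calls above also show the mixed form: one side a unique_span, the other a raw pointer, with the byte count taken from the span; kernels still receive raw pointers, hence .data() at the launch site. A hedged recap of those forms (n, my_kernel and launch_config are placeholders, not part of the commit):

	auto device = cuda::device::current::get();
	auto h = std::unique_ptr<float[]>(new float[n]);            // host buffer, as in this example
	auto d = cuda::memory::make_unique_span<float>(device, n);  // device buffer

	cuda::memory::copy(d, h.get());                       // host-to-device; size taken from d
	cuda::launch(my_kernel, launch_config, d.data(), n);  // kernel parameters stay raw pointers
	cuda::memory::copy(h.get(), d);                       // device-to-host; size taken from d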
2 changes: 2 additions & 0 deletions src/cuda/api.hpp
@@ -32,6 +32,7 @@
 #endif
 #include "api/unique_ptr.hpp"
 #include "api/unique_region.hpp"
+#include "api/unique_span.hpp"
 #include "api/link_options.hpp"
 
 #include "api/device.hpp"
@@ -69,6+70,7 @@
 #include "api/multi_wrapper_impls/stream.hpp"
 #include "api/multi_wrapper_impls/memory.hpp"
 #include "api/multi_wrapper_impls/unique_region.hpp"
+#include "api/multi_wrapper_impls/unique_span.hpp"
 #include "api/multi_wrapper_impls/virtual_memory.hpp"
 #include "api/multi_wrapper_impls/kernel.hpp"
 #include "api/multi_wrapper_impls/kernel_launch.hpp"
109 changes: 109 additions & 0 deletions src/cuda/api/multi_wrapper_impls/unique_span.hpp
@@ -0,0 +1,109 @@
/**
* @file
*
* @brief Implementations of @ref `make_unique_span()` functions
*/
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_SPAN_HPP_
#define MULTI_WRAPPER_IMPLS_UNIQUE_SPAN_HPP_

#include "../unique_span.hpp"
#include "../current_device.hpp"
#include "../current_context.hpp"
#include "../primary_context.hpp"
#include "../memory.hpp"
#include "../types.hpp"

namespace cuda {

namespace memory {

namespace device {

/**
 * @brief Allocate memory for a sequence of elements in device-global memory,
 * within a given context, owned by the returned `unique_span`
 */
template <typename T>
unique_span<T> make_unique_span(const context_t& context, size_t num_elements)
{
	return detail_::make_unique_span<T>(context.handle(), num_elements);
}

/**
 * @brief Allocate memory for a sequence of elements in device-global
 * memory, owned by the returned `unique_span`
 *
 * @tparam T the type of the individual elements
 *
 * @param device on which to allocate the elements
 * @param num_elements the number of elements to allocate
 * @return a `unique_span` owning the allocated memory
 */
template <typename T>
unique_span<T> make_unique_span(const device_t& device, size_t num_elements)
{
auto pc = device.primary_context();
CAW_SET_SCOPE_CONTEXT(pc.handle());
return make_unique_span<T>(pc, num_elements);
}

/**
 * @brief Allocate memory for a sequence of elements in device-global
 * memory on the current device, owned by the returned `unique_span`
 *
 * @note The allocation will be made in the device's primary context -
 * which will be created if it has not yet been.
 *
 * @tparam T the type of the individual elements
 *
 * @param num_elements the number of elements to allocate
 *
 * @return a `unique_span` owning the allocated memory
 */
template <typename T>
unique_span<T> make_unique_span(size_t num_elements)
{
auto current_device_id = cuda::device::current::detail_::get_id();
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
return make_unique_span<T>(pc, num_elements);
}

} // namespace device

namespace managed {

template <typename T>
unique_span<T> make_unique_span(
	const context_t& context,
	size_t num_elements,
	initial_visibility_t initial_visibility)
{
	CAW_SET_SCOPE_CONTEXT(context.handle());
	return unique_span<T>{ detail_::allocate_in_current_context(num_elements * sizeof(T), initial_visibility) };
}

template <typename T>
unique_span<T> make_unique_span(
	const device_t& device,
	size_t num_elements,
	initial_visibility_t initial_visibility)
{
	auto pc = device.primary_context();
	return make_unique_span<T>(pc, num_elements, initial_visibility);
}

template <typename T>
unique_span<T> make_unique_span(
	size_t num_elements,
	initial_visibility_t initial_visibility)
{
	auto current_device_id = cuda::device::current::detail_::get_id();
	auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
	return make_unique_span<T>(pc, num_elements, initial_visibility);
}

} // namespace managed

} // namespace memory

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_UNIQUE_SPAN_HPP_
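Taken together, the overloads above give both device-global and managed memory three ways to choose where the allocation lives: an explicit context, a device (via its primary context), or the current device. A hedged usage sketch (n is a placeholder, and the initial_visibility_t enumerator is assumed from the library's managed-memory API rather than shown in this commit):

	auto dev = cuda::device::current::get();
	auto ctx = dev.primary_context();

	// Device-global memory: per-context, per-device, or current-device
	auto s1 = cuda::memory::device::make_unique_span<float>(ctx, n);
	auto s2 = cuda::memory::device::make_unique_span<float>(dev, n);
	auto s3 = cuda::memory::device::make_unique_span<float>(n);

	// Managed ("unified") memory, with an initial-visibility choice
	auto m = cuda::memory::managed::make_unique_span<float>(
		dev, n, cuda::memory::managed::initial_visibility_t::to_all_devices);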

1 change: 1 addition & 0 deletions src/cuda/api/unique_region.hpp
@@ -244,6 +244,7 @@ inline unique_region make_unique_region(
 } // namespace managed
 
 } // namespace memory
+
 } // namespace cuda
 
 #endif // CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_
(The diff of the sixth changed file, src/cuda/api/unique_span.hpp, which defines the unique_span class itself, did not load on this page.)
