diff --git a/examples/by_api_module/event_management.cu b/examples/by_api_module/event_management.cu index da959c6e..44ac9af5 100644 --- a/examples/by_api_module/event_management.cu +++ b/examples/by_api_module/event_management.cu @@ -94,13 +94,12 @@ int main(int argc, char **argv) cuda::event::do_record_timings, cuda::event::not_interprocess); - constexpr size_t buffer_size = 12345678; - auto buffer = cuda::memory::managed::make_unique( - device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices); + auto buffer = cuda::memory::managed::make_unique_span( + device, 12345678, cuda::memory::managed::initial_visibility_t::to_all_devices); auto wrapped_kernel = cuda::kernel::get(device, increment); auto launch_config = cuda::launch_config_builder() .kernel(&wrapped_kernel) - .overall_size(buffer_size) + .overall_size(buffer.size()) .use_maximum_linear_block() .build(); @@ -110,7 +109,7 @@ int main(int argc, char **argv) report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2); }; stream.enqueue.host_invokable(first_callback); - stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size); + stream.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer.size()); auto second_callback = [&] { report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2); @@ -136,7 +135,7 @@ int main(int argc, char **argv) report_occurrence("After synchronizing on event_2, but before synchronizing on the stream", event_1, event_2); std::cout << cuda::event::time_elapsed_between(event_1, event_2).count() << " msec have elapsed, " - << "executing the second kernel (\"increment\") on a buffer of " << buffer_size + << "executing the second kernel (\"increment\") on a buffer of " << buffer.size() << " chars and triggering two callbacks.\n"; // ... and this should make the third kernel execute stream.synchronize(); diff --git a/examples/by_api_module/stream_management.cu b/examples/by_api_module/stream_management.cu index f03b8120..5583c392 100644 --- a/examples/by_api_module/stream_management.cu +++ b/examples/by_api_module/stream_management.cu @@ -154,21 +154,21 @@ int main(int argc, char **argv) #endif constexpr auto buffer_size = 12345678; - auto buffer = cuda::memory::managed::make_unique( + auto buffer = cuda::memory::managed::make_unique_span( buffer_size, device.supports_concurrent_managed_access() ? cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access: cuda::memory::managed::initial_visibility_t::to_all_devices); - print_first_char(buffer.get()); - std::fill(buffer.get(), buffer.get() + buffer_size, 'a'); - print_first_char(buffer.get()); + print_first_char(buffer.data()); + std::fill(buffer.begin(), buffer.end(), 'a'); + print_first_char(buffer.data()); auto event_1 = cuda::event::create(device, cuda::event::sync_by_blocking); stream_1.enqueue.kernel_launch(print_message, single_thread_config, message("I'm on stream 1")); - stream_1.enqueue.memset(buffer.get(), 'b', buffer_size); + stream_1.enqueue.memset(buffer, 'b'); auto callback = [&]() { std::cout << "Callback from stream 1!... 
\n"; - print_first_char(buffer.get()); + print_first_char(buffer.data()); }; stream_1.enqueue.host_invokable(callback); auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); @@ -177,15 +177,15 @@ int main(int argc, char **argv) // TODO: The following doesn't have much of a meaningful effect; we should modify this example // so that the attachment has some observable effect stream_1.enqueue.attach_managed_region(buffer.get()); - stream_1.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size); + stream_1.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer_size); event_1.record(stream_1); stream_1.enqueue.kernel_launch(print_message, single_thread_config, message("I'm on stream 1")); stream_2.enqueue.wait(event_1); - stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.get()); + stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.data()); stream_2.enqueue.kernel_launch(print_message, single_thread_config, message("I'm on stream 2")); bool idleness_1 = stream_2.has_work_remaining(); device.synchronize(); - print_first_char(buffer.get()); + print_first_char(buffer.data()); // cuda::memory::managed::free(buffer); bool idleness_2 = stream_2.has_work_remaining(); std::cout << std::boolalpha diff --git a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu index 996f29b2..f578cfdc 100644 --- a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu +++ b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu @@ -29,12 +29,12 @@ __global__ void increment_kernel(datum*g_data, datum inc_value) g_data[global_idx] = g_data[global_idx] + inc_value; } -bool correct_output(int *data, const int n, const int x) +bool correct_output(cuda::span data, const int x) { - for (int i = 0; i < n; i++) + for (size_t i = 0; i < data.size(); i++) if (data[i] != x) { - printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); + printf("Error! 
data[%lu] = %d, ref = %d\n", i, data[i], x); return false; } return true; @@ -51,15 +51,14 @@ int main(int, char **) std::cout << "CUDA device [" << device.name() << "]\n"; - int n = 16 * 1024 * 1024; - int num_bytes = n * sizeof(datum); + const int n = 16 * 1024 * 1024; int value = 26; // allocate host memory - auto a = cuda::memory::host::make_unique(n); - cuda::memory::host::zero(a.get(), num_bytes); + auto a = cuda::memory::host::make_unique_span(n); + cuda::memory::host::zero(a); - auto d_a = cuda::memory::make_unique(device, n); + auto d_a = cuda::memory::make_unique_span(device, n); auto launch_config = cuda::launch_config_builder() .overall_size(n) @@ -80,9 +79,9 @@ int main(int, char **) auto stream = device.default_stream(); // device.create_stream(cuda::stream::async); auto cpu_time_start = std::chrono::high_resolution_clock::now(); stream.enqueue.event(start_event); - stream.enqueue.copy(d_a.get(), a.get(), num_bytes); - stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.get(), value); - stream.enqueue.copy(a.get(), d_a.get(), num_bytes); + stream.enqueue.copy(d_a, a); + stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value); + stream.enqueue.copy(a, d_a); stream.enqueue.event(end_event); auto cpu_time_end = std::chrono::high_resolution_clock::now(); @@ -99,7 +98,7 @@ int main(int, char **) std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2)<< (cpu_time_end - cpu_time_start).count() << '\n'; std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n"; - auto bFinalResults = correct_output(a.get(), n, value); + auto bFinalResults = correct_output(a, value); std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n'; diff --git a/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp b/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp index bba9ac0f..d21c91c5 100644 --- a/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp +++ b/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp @@ -89,17 +89,17 @@ int main() std::unique_ptr(new float[nElements]) ); - auto device_buffer = cuda::memory::device::make_unique(nElements); + auto device_buffer = cuda::memory::device::make_unique_span(nElements); auto pinned_host_buffers = std::make_pair( - cuda::memory::host::make_unique(nElements), - cuda::memory::host::make_unique(nElements) + cuda::memory::host::make_unique_span(nElements), + cuda::memory::host::make_unique_span(nElements) ); auto h_aPageable = pageable_host_buffers.first.get(); auto h_bPageable = pageable_host_buffers.second.get(); - auto h_aPinned = pinned_host_buffers.first.get(); - auto h_bPinned = pinned_host_buffers.second.get(); + auto h_aPinned = pinned_host_buffers.first.data(); + auto h_bPinned = pinned_host_buffers.second.data(); std::iota(h_aPageable, h_aPageable + nElements, 0.0); cuda::memory::copy(h_aPinned, h_aPageable, bytes); @@ -112,6 +112,6 @@ int main() std::cout << "\nTransfer size (MB): " << (bytes / Mi) << "\n"; // perform copies and report bandwidth - profileCopies(h_aPageable, h_bPageable, device_buffer.get(), nElements, "Pageable"); - profileCopies(h_aPinned, h_bPinned, device_buffer.get(), nElements, "Pinned"); + profileCopies(h_aPageable, h_bPageable, device_buffer.data(), nElements, "Pageable"); + profileCopies(h_aPinned, h_bPinned, device_buffer.data(), nElements, "Pinned"); } diff --git a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp index 
633d37dc..5f347b64 100644 --- a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp +++ b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp @@ -154,20 +154,20 @@ int main() { const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block; - auto d_input = cuda::memory::make_unique(device, input_size); - auto d_output = cuda::memory::make_unique(device, num_blocks); + auto d_input = cuda::memory::make_unique_span(device, input_size); + auto d_output = cuda::memory::make_unique_span(device, num_blocks); // Note: We won't actually be checking the output... - auto d_timers = cuda::memory::make_unique(device, num_timers); - cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float)); + auto d_timers = cuda::memory::make_unique_span(device, num_timers); + cuda::memory::copy(d_input, input.get()); auto launch_config = cuda::launch_config_builder() .num_blocks(num_blocks) .block_size(num_threads_per_block) .dynamic_shared_memory_size(dynamic_shared_mem_size) .build(); - cuda::launch(kernel_in_module, launch_config, d_input.get(), d_output.get(), d_timers.get()); + cuda::launch(kernel_in_module, launch_config, d_input.data(), d_output.data(), d_timers.data()); device.synchronize(); - cuda::memory::copy(timers.get(), d_timers.get(), num_timers * sizeof(clock_t)); + cuda::memory::copy(timers.get(), d_timers); } // The allocated device buffers are released here long double average_elapsed_clock_ticks_per_block = compute_average_elapsed_clocks(timers.get(), num_blocks); diff --git a/examples/modified_cuda_samples/graphMemoryNodes/graphMemoryNodes.cu b/examples/modified_cuda_samples/graphMemoryNodes/graphMemoryNodes.cu new file mode 100644 index 00000000..f3d071a2 --- /dev/null +++ b/examples/modified_cuda_samples/graphMemoryNodes/graphMemoryNodes.cu @@ -0,0 +1,560 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// System includes +#include +#include + +#include +#include + +// CUDA runtime +#include + +// helper functions and utilities to work with CUDA +#include +#include + +#define THREADS_PER_BLOCK 512 +#define ALLOWABLE_VARIANCE 1.e-6f +#define NUM_ELEMENTS 8000000 + +// Stores the square of each input element in output array +__global__ void squareArray(const float *input, float *output, + int numElements) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numElements) { + output[idx] = input[idx] * input[idx]; + } +} + +// Stores the negative of each input element in output array +__global__ void negateArray(const float *input, float *output, + int numElements) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numElements) { + output[idx] = input[idx] * -1; + } +} + +struct negSquareArrays { + float *input; + float *square; + float *negSquare; + int numElements; + size_t bytes; + size_t numBlocks; +}; + +void fillRandomly(float *array, int numElements) +{ + for (int n = 0; n < numElements; n++) { + array[n] = rand() / (float) RAND_MAX; + } +} + +void resetOutputArrays(negSquareArrays *hostArrays) +{ + fillRandomly(hostArrays->square, hostArrays->numElements); + fillRandomly(hostArrays->negSquare, hostArrays->numElements); +} + +void prepareHostArrays(negSquareArrays *hostArrays) +{ + hostArrays->numElements = NUM_ELEMENTS; + size_t bytes = hostArrays->numElements * sizeof(float); + + size_t numBlocks = hostArrays->numElements / (size_t) THREADS_PER_BLOCK; + if ((numBlocks % (size_t) THREADS_PER_BLOCK) != 0) { + numBlocks++; + } + + hostArrays->input = (float *) malloc(bytes); + hostArrays->square = (float *) malloc(bytes); + hostArrays->negSquare = (float *) malloc(bytes); + hostArrays->bytes = bytes; + hostArrays->numBlocks = numBlocks; + + fillRandomly(hostArrays->input, hostArrays->numElements); + fillRandomly(hostArrays->square, hostArrays->numElements); + fillRandomly(hostArrays->negSquare, hostArrays->numElements); +} + +cuda::graph::instance_t createFreeGraph(float *dPtr) +{ + cudaGraphNode_t freeNode; + + auto graph = cuda::graph::create(); + auto node = graph.insert.node(dPtr); + return graph.instantiate(); +} + +/** + * Demonstrates explicitly creating a CUDA graph including memory nodes. + * createNegateSquaresGraphWithStreamCapture constructs an equivalent graph + * using stream capture. + * + * If d_negSquare_out is non null, then: + * 1) d_negSquare will not be freed; + * 2) the value of d_negSquare_out will be set to d_negSquare. 
+ * + * Diagram of the graph constructed by createNegateSquaresGraphExplicitly: + * + * alloc d_input + * | + * alloc d_square + * | + * Memcpy a to device + * | + * launch kernel squareArray ------->---- Memcpy d_square to host + * | | + * free d_input | + * | | + * allocate d_negSquare | + * | | + * launch kernel negateArray -------->--- free d_square + * | + * Memcpy d_negSquare to host + * | + * free d_negSquare + */ +std::pair +createNegateSquaresGraphExplicitly(int device, negSquareArrays *hostArrays, bool do_neg_squares) +{ + // Array buffers on device + float *d_input, *d_square, *d_negSquare; + + // Memory allocation parameters + cudaMemAllocNodeParams allocParams; + memset(&allocParams, 0, sizeof(allocParams)); + allocParams.bytesize = hostArrays->bytes; + allocParams.poolProps.allocType = cudaMemAllocationTypePinned; + allocParams.poolProps.location.id = device; + allocParams.poolProps.location.type = cudaMemLocationTypeDevice; + + // Kernel launch parameters + cudaKernelNodeParams kernelNodeParams = {0}; + kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.extra = NULL; + + cudaGraph_t graph; + cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare; + cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare; + cudaGraphNode_t squareKernelNode, negateKernelNode; + cudaGraphNode_t freeNodeInput, freeNodeSquare; + + // Buffer for storing graph node dependencies + std::vector nodeDependencies; + + checkCudaErrors(cudaGraphCreate(&graph, 0)); + + checkCudaErrors( + cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams)); + d_input = (float *) allocParams.dptr; + + // To keep the graph structure simple (fewer branching dependencies), + // allocNodeSquare should depend on allocNodeInput + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph, + &allocNodeInput, 1, &allocParams)); + d_square = (float *) allocParams.dptr; + + // copyNodeInput needs to depend on allocNodeInput because copyNodeInput + // writes to d_input. It does so here indirectly through allocNodeSquare. + checkCudaErrors(cudaGraphAddMemcpyNode1D( + ©NodeInput, graph, &allocNodeSquare, 1, d_input, hostArrays->input, + hostArrays->bytes, cudaMemcpyHostToDevice)); + + void *squareKernelArgs[3] = {(void *) &d_input, (void *) &d_square, + (void *) &(hostArrays->numElements)}; + kernelNodeParams.func = (void *) squareArray; + kernelNodeParams.kernelParams = (void **) squareKernelArgs; + + // Square kernel depends on copyNodeInput to ensure all data is on the device + // before kernel launch. + checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph, + ©NodeInput, 1, &kernelNodeParams)); + + checkCudaErrors(cudaGraphAddMemcpyNode1D( + ©NodeSquare, graph, &squareKernelNode, 1, hostArrays->square, + d_square, hostArrays->bytes, cudaMemcpyDeviceToHost)); + + // Free of d_input depends on the square kernel to ensure that d_input is not + // freed while being read by the kernel. It also depends on the alloc of + // d_input via squareKernelNode > copyNodeInput > allocNodeSquare > + // allocNodeInput. + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph, + &squareKernelNode, 1, d_input)); + + // Allocation of C depends on free of A so CUDA can reuse the virtual address. 
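+ // (In this example, "C" refers to d_negSquare and "A" to d_input; the pointer comparison just below checks whether the freed virtual address was in fact reused.)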
+ checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph, + &freeNodeInput, 1, &allocParams)); + d_negSquare = (float *) allocParams.dptr; + + if (d_negSquare == d_input) { + printf( + "Check verified that d_negSquare and d_input share a virtual " + "address.\n"); + } + + void *negateKernelArgs[3] = {(void *) &d_square, (void *) &d_negSquare, + (void *) &(hostArrays->numElements)}; + kernelNodeParams.func = (void *) negateArray; + kernelNodeParams.kernelParams = (void **) negateKernelArgs; + + checkCudaErrors(cudaGraphAddKernelNode( + &negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams)); + + nodeDependencies.push_back(copyNodeSquare); + nodeDependencies.push_back(negateKernelNode); + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeSquare, graph, + nodeDependencies.data(), + nodeDependencies.size(), d_square)); + nodeDependencies.clear(); + + checkCudaErrors(cudaGraphAddMemcpyNode1D( + ©NodeNegSquare, graph, &negateKernelNode, 1, hostArrays->negSquare, + d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost)); + + if (d_negSquare_out == NULL) { + cudaGraphNode_t freeNodeNegSquare; + checkCudaErrors(cudaGraphAddMemFreeNode( + &freeNodeNegSquare, graph, ©NodeNegSquare, 1, d_negSquare)); + } + else { + *d_negSquare_out = d_negSquare; + } + + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); +} + +cuda::graph::instance_t +createNegateSquaresGraphExplicitly(int device, negSquareArrays *hostArrays) +{ + static constexpr const auto do_neg_squares { true }; + return createNegateSquaresGraphExplicitly(device, hostArrays, do_neg_squares).first; +} + +/** + * Adds work to a CUDA stream which negates the square of values in the input + * array. + * + * If d_negSquare_out is non null, then: + * 1) d_negSquare will not be freed; + * 2) the value of d_negSquare_out will be set to d_negSquare. + * + * Diagram of the stream operations in doNegateSquaresInStream + * --------------------------------------------------------------------- + * | STREAM | STREAM2 | + * --------------------------------------------------------------------- + * + * alloc d_input + * | + * alloc d_square + * | + * Memcpy a to device + * | + * launch kernel squareArray + * | + * record squareKernelCompleteEvent -->-- wait squareKernelCompleteEvent + * | | + * free d_input | + * | | + * allocate d_negSquare Memcpy d_square to host + * | | + * launch kernel negateArray | + * | | + * record negateKernelCompleteEvent -->-- wait negateKernelCompleteEvent + * | | + * Memcpy d_negSquare to host | + * | free d_square + * free d_negSquare | + * | | + * wait squareFreeEvent --------------<---- record squareFreeEvent + */ +void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays, + float **d_negSquare_out = NULL) +{ + float *d_input, *d_square, *d_negSquare; + cudaStream_t stream2; + cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent, + squareFreeEvent; + + checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking)); + + checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent)); + checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent)); + checkCudaErrors(cudaEventCreate(&squareFreeEvent)); + + // Virtual addresses are assigned synchronously when cudaMallocAsync is + // called, thus there is no performace benefit gained by separating the + // allocations into two streams. 
+ checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1)); + checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1)); + + checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes, + cudaMemcpyHostToDevice, stream1)); + squareArray<<numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( + d_input, d_square, hostArrays->numElements); + checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1)); + + checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0)); + checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square, + hostArrays->bytes, cudaMemcpyDeviceToHost, + stream2)); + + checkCudaErrors(cudaFreeAsync(d_input, stream1)); + checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1)); + negateArray<<numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( + d_square, d_negSquare, hostArrays->numElements); + checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1)); + checkCudaErrors(cudaMemcpyAsync(hostArrays->negSquare, d_negSquare, + hostArrays->bytes, cudaMemcpyDeviceToHost, + stream1)); + if (d_negSquare_out == NULL) { + checkCudaErrors(cudaFreeAsync(d_negSquare, stream1)); + } + else { + *d_negSquare_out = d_negSquare; + } + + checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0)); + checkCudaErrors(cudaFreeAsync(d_square, stream2)); + checkCudaErrors(cudaEventRecord(squareFreeEvent, stream2)); + + checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0)); + + checkCudaErrors(cudaStreamDestroy(stream2)); + checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent)); + checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent)); + checkCudaErrors(cudaEventDestroy(squareFreeEvent)); +} + +/** + * Demonstrates creating a CUDA graph including memory nodes using stream + * capture. createNegateSquaresGraphExplicitly constructs an equivalent graph + * without stream capture. 
+ */ +cuda::graph::instance_t createNegateSquaresGraphWithStreamCapture(negSquareArrays *hostArrays, + float **d_negSquare_out = NULL) +{ + cudaGraph_t graph; + cudaStream_t stream; + + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + doNegateSquaresInStream(stream, hostArrays, d_negSquare_out); + checkCudaErrors(cudaStreamEndCapture(stream, &graph)); + + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaGraphDestroy(graph)); +} + +void prepareRefArrays(negSquareArrays *hostArrays, + negSquareArrays *deviceRefArrays, + bool **foundValidationFailure) +{ + deviceRefArrays->bytes = hostArrays->bytes; + deviceRefArrays->numElements = hostArrays->numElements; + + for (int i = 0; i < hostArrays->numElements; i++) { + hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i]; + hostArrays->negSquare[i] = hostArrays->square[i] * -1; + } + + checkCudaErrors( + cudaMalloc((void **) &deviceRefArrays->negSquare, deviceRefArrays->bytes)); + checkCudaErrors(cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare, + hostArrays->bytes, cudaMemcpyHostToDevice)); + + checkCudaErrors( + cudaMallocManaged((void **) foundValidationFailure, sizeof(bool))); +} + +int checkValidationFailure(bool *foundValidationFailure) +{ + if (*foundValidationFailure) { + std::cout << "Validation FAILURE!\n\n"; + *foundValidationFailure = false; + return EXIT_FAILURE; + } + else { + std::cout << "Validation PASSED!\n\n"; + return EXIT_SUCCESS; + } +} + +__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays, + bool *foundValidationFailure) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float ref, diff; + + if (idx < devRefArrays.numElements) { + ref = devRefArrays.negSquare[idx]; + diff = d_negSquare[idx] - ref; + diff *= diff; + ref *= ref; + if (diff / ref > ALLOWABLE_VARIANCE) { + *foundValidationFailure = true; + } + } +} + +void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) +{ + float ref, diff; + + for (int i = 0; i < hostArrays->numElements; i++) { + ref = hostArrays->input[i] * hostArrays->input[i] * -1; + diff = hostArrays->negSquare[i] - ref; + diff *= diff; + ref *= ref; + if (diff / ref > ALLOWABLE_VARIANCE) { + *foundValidationFailure = true; + } + } +} + +int main(int argc, char **argv) +{ + negSquareArrays hostArrays, deviceRefArrays; + + auto launch_config = cuda::launch_config_builder() + .block_dimensions(THREADS_PER_BLOCK) + .grid_dimensions(hostArrays.numBlocks) + .no_dynamic_shared_memory() + .build(); + + // Declare pointers for GPU buffers + float *d_negSquare = NULL; + bool *foundValidationFailure = NULL; + + srand(time(0)); + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
std::stoi(argv[1]) : cuda::device::default_device_id; + auto device = cuda::device::get(device_id); + + if (cuda::version_numbers::driver() < cuda::version_numbers::make(11040)) { + std::cout << "Waiving execution as driver does not support Graph Memory Nodes\n"; + exit(EXIT_SUCCESS); + } + + if (not device.supports_memory_pools()) { + std::cout << "Waiving execution as device does not support Memory Pools\n"; + exit(EXIT_SUCCESS); + } + std::cout << "Setting up sample.\n"; + + prepareHostArrays(&hostArrays); + prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure); + auto stream = device.create_stream(cuda::stream::async); + std::cout << "Setup complete.\n\n"; + + std::cout << "Running negateSquares in a stream.\n"; + doNegateSquaresInStream(stream.handle(), &hostArrays); + std::cout << "Validating negateSquares in a stream...\n"; + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + + { + std::cout << "Running negateSquares in a stream-captured graph.\n"; + auto executable_graph_instance = createNegateSquaresGraphWithStreamCapture(&hostArrays); + cuda::graph::launch(executable_graph_instance, stream); + stream.synchronize(); + std::cout << "Validating negateSquares in a stream-captured graph...\n"; + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + } + + { + std::cout << "Running negateSquares in an explicitly constructed graph.\n"; + auto executable_graph_instance = createNegateSquaresGraphExplicitly(device.id(), &hostArrays); + cuda::graph::launch(executable_graph_instance, stream); + stream.synchronize(); + std::cout << "Validating negateSquares in an explicitly constructed graph...\n"; + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + } + // Each of the three examples below free d_negSquare outside the graph. As + // demonstrated by validateGPU, d_negSquare can be accessed by outside the + // graph before d_negSquare is freed. + + { + std::cout << "Running negateSquares with d_negSquare freed outside the stream.\n"; + static constexpr const auto compute_neg_squares { true }; + auto pair = createNegateSquaresGraphExplicitly(device.id(), &hostArrays, compute_neg_squares); + auto executable_graph_instance = std::move(pair.first); + auto d_negSquare = std::move(pair.second); + auto free_graph_instance = createFreeGraph(d_negSquare); + cuda::graph::launch(executable_graph_instance, stream); + stream.enqueue.kernel_launch(validateGPU, launch_config, d_negSquare, deviceRefArrays, foundValidationFailure); + stream.synchronize(); + printf( + "Validating negateSquares with d_negSquare freed outside the " + "stream...\n"); + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + + std::cout << "Running negateSquares with d_negSquare freed outside the graph.\n"; + cuda::graph::launch(executable_graph_instance, stream); + stream.enqueue.kernel_launch(validateGPU, launch_config, d_negSquare, deviceRefArrays, foundValidationFailure); + stream.synchronize(); + printf( + "Validating negateSquares with d_negSquare freed outside the graph...\n"); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + // TODO: What about the instance vs the FreeC? 
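+ // In this last variant, d_negSquare is freed by launching free_graph_instance (the single-node graph built by createFreeGraph) on the stream, after the validation kernel has been enqueued.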
+ printf( + "Running negateSquares with d_negSquare freed in a different graph.\n"); + cuda::graph::launch(executable_graph_instance, stream); + stream.enqueue.kernel_launch(validateGPU, launch_config, d_negSquare, deviceRefArrays, foundValidationFailure); + cuda::graph::launch(free_graph_instance, stream); + stream.synchronize(); + printf( + "Validating negateSquares with d_negSquare freed in a different " + "graph...\n"); + checkValidationFailure(foundValidationFailure); + + } + + std::cout << "\nSUCCESS\n"; +} \ No newline at end of file diff --git a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu index fb861605..32baa02b 100644 --- a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu +++ b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu @@ -43,26 +43,26 @@ int main(int, char **) cuda::device::current::set_to_default(); auto device = cuda::device::current::get(); - auto d_ptr = cuda::memory::make_unique(device, N); - auto h_ptr = cuda::memory::host::make_unique(N); + auto d_span = cuda::memory::make_unique_span(device, N); + auto h_span = cuda::memory::host::make_unique_span(N); std::cout << "Generating data on CPU\n"; - sequence_cpu(h_ptr.get(), N); + sequence_cpu(h_span.data(), h_span.size()); auto launch_config = cuda::launch_config_builder() .overall_size(N) .block_size(256) .build(); - device.launch(sequence_gpu, launch_config, d_ptr.get(), N); + device.launch(sequence_gpu, launch_config, d_span.data(), d_span.size()); cuda::outstanding_error::ensure_none(); device.synchronize(); - auto h_d_ptr = cuda::memory::host::make_unique(N); - cuda::memory::copy(h_d_ptr.get(), d_ptr.get(), N * sizeof(int)); + auto h_d_span = cuda::memory::host::make_unique_span(N); + cuda::memory::copy(h_d_span, d_span); - auto results_are_correct = std::equal(h_ptr.get(), h_ptr.get() + N, h_d_ptr.get()); + auto results_are_correct = std::equal(h_span.begin(), h_span.end(), h_d_span.begin()); if (not results_are_correct) { die_("Results check failed."); } diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index 7592441d..d440b1d7 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -124,9 +124,8 @@ void checkP2Paccess() } void enqueue_p2p_copy( - int *dest, - int *src, - std::size_t num_elems, + cuda::memory::region_t dest, + cuda::memory::region_t src, int repeat, bool p2paccess, P2PEngine p2p_mechanism, @@ -145,7 +144,7 @@ void enqueue_p2p_copy( auto launch_config = cuda::launch_configuration_t{grid_and_block_dims}; for (int r = 0; r < repeat; r++) { - stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest, (int4*)src, num_elems/sizeof(int4)); + stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest.data(), (int4*)src.data(), src.size()/sizeof(int4)); } } else @@ -155,7 +154,7 @@ void enqueue_p2p_copy( // Since we assume Compute Capability >= 2.0, all devices support the // Unified Virtual Address Space, so we don't need to use // cudaMemcpyPeerAsync - cudaMemcpyAsync is enough. 
- cuda::memory::async::copy(dest, src, sizeof(*dest)*num_elems, stream); + cuda::memory::async::copy(dest, src, stream); } } } @@ -165,8 +164,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p int numElems = 10000000; int repeat = 5; vector streams; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -176,8 +175,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p for (auto device : cuda::devices()) { streams.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique_span(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique_span(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -212,17 +211,17 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p if (i == j) { // Perform intra-GPU, D2D copies - enqueue_p2p_copy(buffers[i].get(), buffersD2D[i].get(), numElems, repeat, p2p_access_possible, mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffersD2D[i], repeat, p2p_access_possible, mechanism, streams[i]); } else { if (p2p_method == P2P_WRITE) { - enqueue_p2p_copy(buffers[j].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, mechanism, streams[i]); + enqueue_p2p_copy(buffers[j], buffers[i], repeat, p2p_access_possible, mechanism, streams[i]); } else { - enqueue_p2p_copy(buffers[i].get(), buffers[j].get(), numElems, repeat, p2p_access_possible, mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffers[j], repeat, p2p_access_possible, mechanism, streams[i]); } } @@ -295,8 +294,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) vector streams_0; vector streams_1; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -308,8 +307,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) for (auto device : cuda::devices()) { streams_0.push_back(device.create_stream(cuda::stream::async)); streams_1.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique_span(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique_span(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -350,12 +349,12 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) if (i == j) { // For intra-GPU perform 2 memcopies buffersD2D <-> buffers - enqueue_p2p_copy(buffers[i].get(), buffersD2D[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); - enqueue_p2p_copy(buffersD2D[i].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams_1[i]); + enqueue_p2p_copy(buffers[i], buffersD2D[i], repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); + enqueue_p2p_copy(buffersD2D[i], buffers[i], repeat, p2p_access_possible, p2p_mechanism, streams_1[i]); } else { - enqueue_p2p_copy(buffers[i].get(), buffers[j].get(), 
numElems, repeat, p2p_access_possible, p2p_mechanism, streams_1[j]); - enqueue_p2p_copy(buffers[j].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); + enqueue_p2p_copy(buffers[i], buffers[j], repeat, p2p_access_possible, p2p_mechanism, streams_1[j]); + enqueue_p2p_copy(buffers[j], buffers[i], repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); } // Notify stream0 that stream1 is complete and record the time of @@ -406,8 +405,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer // vector streams; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -417,8 +416,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer for(auto device : cuda::devices()) { streams.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique_span(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique_span(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -455,16 +454,16 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer auto time_before_copy = std::chrono::high_resolution_clock::now(); if (i == j) { // Perform intra-GPU, D2D copies - enqueue_p2p_copy(buffers[i].get(), buffersD2D[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffersD2D[i], repeat, p2p_access_possible, p2p_mechanism, streams[i]); } else { if (p2p_method == P2P_WRITE) { - enqueue_p2p_copy(buffers[j].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams[i]); + enqueue_p2p_copy(buffers[j], buffers[i], repeat, p2p_access_possible, p2p_mechanism, streams[i]); } else { - enqueue_p2p_copy(buffers[i].get(), buffers[j].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffers[j], repeat, p2p_access_possible, p2p_mechanism, streams[i]); } } auto time_after_copy = std::chrono::high_resolution_clock::now(); diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp index ae7924c6..352d037f 100644 --- a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -147,13 +147,13 @@ int main(int argc, char** argv) std::generate_n(h_B.get(), N, generator); // Allocate vectors in device memory - auto d_A = cuda::memory::make_unique(device, N); - auto d_B = cuda::memory::make_unique(device, N); - auto d_C = cuda::memory::make_unique(device, N); + auto d_A = cuda::memory::make_unique_span(device, N); + auto d_B = cuda::memory::make_unique_span(device, N); + auto d_C = cuda::memory::make_unique_span(device, N); - cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream); - cuda::memory::async::copy(d_B.get(), h_B.get(), size, stream); + cuda::memory::async::copy(d_A, h_A.get(), size, stream); + cuda::memory::async::copy(d_B, h_B.get(), size, stream); auto launch_config = cuda::launch_config_builder() .overall_size(N) @@ -162,9 +162,9 @@ int main(int argc, char** argv) 
cuda::outstanding_error::ensure_none(); - stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.get(), d_B.get(), d_C.get(), N); + stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N); - cuda::memory::async::copy(h_C.get(), d_C.get(), size, stream); + cuda::memory::async::copy(h_C.get(), d_C, size, stream); stream.synchronize(); for (int i = 0; i < N; ++i) { diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index e68c5568..88230be3 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -67,11 +67,12 @@ __global__ void init_array(int *g_data, const int *factor, int num_iterations) } } -bool check_resulting_data(const int *a, const int n, const int c) +template +bool check_resulting_data(Container const & container, const int c) { - for (int i = 0; i < n; i++) { - if (a[i] != c) { - std::cerr << i << ": " << a[i] << " " << c << "\n"; + for (size_t i = 0; i < container.size(); i++) { + if (container[i] != c) { + std::cerr << i << ": " << container[i] << " " << c << "\n"; return false; } } @@ -111,13 +112,13 @@ void run_simple_streams_example( int c = 5; // value to which the array will be initialized // Allocate Host memory - auto h_a = cuda::memory::host::make_unique(params.n); + auto h_a = cuda::memory::host::make_unique_span(params.n); // allocate device memory // pointers to data and init value in the device memory - auto d_a = cuda::memory::make_unique(device, params.n); - auto d_c = cuda::memory::make_unique(device); - cuda::memory::copy_single(d_c.get(), &c); + auto d_a = cuda::memory::make_unique_span(device, params.n); + auto d_c = cuda::memory::make_unique_span(device, 1); + cuda::memory::copy_single(d_c.data(), &c); std::cout << "\nStarting Test\n"; @@ -142,7 +143,7 @@ void run_simple_streams_example( // time memcpy from device start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed - cuda::memory::async::copy(h_a.get(), d_a.get(), nbytes, streams[0]); + cuda::memory::async::copy(h_a.get(), d_a, streams[0]); stop_event.record(); stop_event.synchronize(); // block until the event is actually recorded auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event); @@ -154,7 +155,7 @@ void run_simple_streams_example( .block_size(512) .build(); start_event.record(); - streams[0].enqueue.kernel_launch(init_array, launch_config, d_a.get(), d_c.get(), params.num_iterations); + streams[0].enqueue.kernel_launch(init_array, launch_config, d_a.data(), d_c.data(), params.num_iterations); stop_event.record(); stop_event.synchronize(); auto time_kernel = cuda::event::time_elapsed_between(start_event, stop_event); @@ -170,8 +171,8 @@ void run_simple_streams_example( for (int k = 0; k < nreps; k++) { - device.launch(init_array, launch_config, d_a.get(), d_c.get(), params.num_iterations); - cuda::memory::copy(h_a.get(), d_a.get(), nbytes); + device.launch(init_array, launch_config, d_a.data(), d_c.data(), params.num_iterations); + cuda::memory::copy(h_a.get(), d_a); } stop_event.record(); @@ -186,11 +187,11 @@ void run_simple_streams_example( .block_size(512) .build(); // TODO: Avoid need to push and pop here - memset(h_a.get(), 255, nbytes); // set host memory bits to all 1s, for testing correctness + std::fill(h_a.begin(), h_a.end(), 255); // set host memory bits to all 1s, for testing correctness // 
This instruction is actually the only one in our program // for which the device.make_current() command was necessary. // TODO: Avoid having to do that altogether... - cuda::memory::device::zero(cuda::memory::region_t{d_a.get(), nbytes}); // set device memory to all 0s, for testing correctness + cuda::memory::device::zero(d_a); // set device memory to all 0s, for testing correctness start_event.record(); for (int k = 0; k < nreps; k++) @@ -199,7 +200,7 @@ void run_simple_streams_example( for (int i = 0; i < nstreams; i++) { streams[i].enqueue.kernel_launch( - init_array, launch_config, d_a.get() + i * params.n / nstreams, d_c.get(), params.num_iterations); + init_array, launch_config, d_a.data() + i * params.n / nstreams, d_c.data(), params.num_iterations); } // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only @@ -207,8 +208,8 @@ void run_simple_streams_example( for (int i = 0; i < nstreams; i++) { cuda::memory::async::copy( - h_a.get() + i * params.n / nstreams, - d_a.get() + i * params.n / nstreams, nbytes / nstreams, + h_a.data() + i * params.n / nstreams, + d_a.data() + i * params.n / nstreams, nbytes / nstreams, streams[i]); } } @@ -220,7 +221,7 @@ void run_simple_streams_example( // check whether the output is correct std::cout << "-------------------------------\n"; - if (not check_resulting_data(h_a.get(), params.n, c * nreps * params.num_iterations)) { + if (not check_resulting_data(h_a, c * nreps * params.num_iterations)) { die_("Result check FAILED."); } } diff --git a/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu b/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu index 9cfac063..1a6be6e2 100644 --- a/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu +++ b/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu @@ -155,9 +155,10 @@ int streamOrderedAllocationPostSync( // Record the start event auto start_event = stream.enqueue.event(); for (int i = 0; i < MAX_ITER; i++) { - auto d_a = span(stream.enqueue.allocate(a.size() * sizeof(float))); - auto d_b = span(stream.enqueue.allocate(b.size() * sizeof(float))); - auto d_c = span(stream.enqueue.allocate(c.size() * sizeof(float))); + // Not: Not using unique_span's, + auto d_a = cuda::span(stream.enqueue.allocate(a.size() * sizeof(float))); + auto d_b = cuda::span(stream.enqueue.allocate(b.size() * sizeof(float))); + auto d_c = cuda::span(stream.enqueue.allocate(c.size() * sizeof(float))); stream.enqueue.copy(d_a, a); stream.enqueue.copy(d_b, b); stream.enqueue.kernel_launch(vectorAddGPU, launch_config, d_a.data(), d_b.data(), d_c.data(), c.size()); diff --git a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu index bf8e24f1..77a8857a 100644 --- a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu +++ b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu @@ -30,10 +30,8 @@ int main() } int numElements = 50000; - size_t size = numElements * sizeof(float); std::cout << "[Vector addition of " << numElements << " elements]\n"; - // If we could rely on C++14, we would use std::make_unique auto h_A = std::unique_ptr(new float[numElements]); auto h_B = std::unique_ptr(new float[numElements]); auto h_C = std::unique_ptr(new float[numElements]); @@ -43,12 +41,12 @@ int main() std::generate(h_B.get(), h_B.get() + numElements, generator); auto device = cuda::device::current::get(); - auto d_A = 
cuda::memory::make_unique(device, numElements); - auto d_B = cuda::memory::make_unique(device, numElements); - auto d_C = cuda::memory::make_unique(device, numElements); + auto d_A = cuda::memory::make_unique_span(device, numElements); + auto d_B = cuda::memory::make_unique_span(device, numElements); + auto d_C = cuda::memory::make_unique_span(device, numElements); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cuda::memory::copy(d_A, h_A.get()); + cuda::memory::copy(d_B, h_B.get()); auto launch_config = cuda::launch_config_builder() .overall_size(numElements) @@ -61,10 +59,10 @@ int main() cuda::launch( vectorAdd, launch_config, - d_A.get(), d_B.get(), d_C.get(), numElements + d_A.data(), d_B.data(), d_C.data(), numElements ); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cuda::memory::copy(h_C.get(), d_C); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { diff --git a/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp b/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp index 384c1dd5..87ca20f9 100644 --- a/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp +++ b/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp @@ -99,8 +99,14 @@ cuda::size_t determine_reservation_size( template