Fixed #291: Added a unique_span class - a uniquely-owned span. It should provide the most flexibility in our example programs (and typical user programs) for direct use without explicit casting, re-specifying types and sizes, etc.
eyalroz committed Apr 19, 2024
1 parent f7796de commit a68a502
Showing 28 changed files with 1,176 additions and 220 deletions.
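
For orientation, a minimal sketch of the pattern the examples are migrated to, assembled from the changed example code below. It is not part of the commit: the body of the increment kernel and the <cuda/api.hpp> include path are assumptions for illustration; the make_unique_span call, launch-config builder chain and kernel-launch arguments are taken from the event_management example.

#include <cuda/api.hpp>  // assumed umbrella header of the cuda-api-wrappers library

// Illustrative kernel; the examples use an "increment" kernel, its body is assumed here
__global__ void increment(char* data, size_t length)
{
    size_t i = threadIdx.x + (size_t) blockIdx.x * blockDim.x;
    if (i < length) { data[i]++; }
}

int main()
{
    auto device = cuda::device::current::get();

    // Before this commit: make_unique<char[]>(device, size, ...) returned a unique_ptr,
    // so the element count had to be carried around separately (e.g. a buffer_size variable).
    // With unique_span, the owned buffer knows its own size and element type, so .data(),
    // .size(), .begin() and .end() are available directly, and the buffer can be passed
    // as-is to copy() and memset(), as the diffs below show.
    auto buffer = cuda::memory::managed::make_unique_span<char>(
        device, 12345678, cuda::memory::managed::initial_visibility_t::to_all_devices);

    auto wrapped_kernel = cuda::kernel::get(device, increment);
    auto launch_config = cuda::launch_config_builder()
        .kernel(&wrapped_kernel)
        .overall_size(buffer.size())
        .use_maximum_linear_block()
        .build();

    auto stream = device.default_stream();
    stream.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer.size());
    stream.synchronize();
}
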
11 changes: 5 additions & 6 deletions examples/by_api_module/event_management.cu
@@ -94,13 +94,12 @@ int main(int argc, char **argv)
     cuda::event::do_record_timings,
     cuda::event::not_interprocess);

-    constexpr size_t buffer_size = 12345678;
-    auto buffer = cuda::memory::managed::make_unique<char[]>(
-        device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices);
+    auto buffer = cuda::memory::managed::make_unique_span<char>(
+        device, 12345678, cuda::memory::managed::initial_visibility_t::to_all_devices);
     auto wrapped_kernel = cuda::kernel::get(device, increment);
     auto launch_config = cuda::launch_config_builder()
         .kernel(&wrapped_kernel)
-        .overall_size(buffer_size)
+        .overall_size(buffer.size())
         .use_maximum_linear_block()
         .build();

@@ -110,7 +109,7 @@ int main(int argc, char **argv)
         report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2);
     };
     stream.enqueue.host_invokable(first_callback);
-    stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
+    stream.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer.size());
     auto second_callback = [&] {
         report_occurrence("In second callback (enqueued after the first kernel but before the second event)",
             event_1, event_2);
@@ -136,7 +135,7 @@ int main(int argc, char **argv)
     report_occurrence("After synchronizing on event_2, but before synchronizing on the stream", event_1, event_2);
     std::cout
         << cuda::event::time_elapsed_between(event_1, event_2).count() << " msec have elapsed, "
-        << "executing the second kernel (\"increment\") on a buffer of " << buffer_size
+        << "executing the second kernel (\"increment\") on a buffer of " << buffer.size()
         << " chars and triggering two callbacks.\n";
     // ... and this should make the third kernel execute
     stream.synchronize();
18 changes: 9 additions & 9 deletions examples/by_api_module/stream_management.cu
@@ -154,21 +154,21 @@ int main(int argc, char **argv)
 #endif

     constexpr auto buffer_size = 12345678;
-    auto buffer = cuda::memory::managed::make_unique<char[]>(
+    auto buffer = cuda::memory::managed::make_unique_span<char>(
         buffer_size,
         device.supports_concurrent_managed_access() ?
             cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access:
             cuda::memory::managed::initial_visibility_t::to_all_devices);
-    print_first_char(buffer.get());
-    std::fill(buffer.get(), buffer.get() + buffer_size, 'a');
-    print_first_char(buffer.get());
+    print_first_char(buffer.data());
+    std::fill(buffer.begin(), buffer.end(), 'a');
+    print_first_char(buffer.data());

     auto event_1 = cuda::event::create(device, cuda::event::sync_by_blocking);
     stream_1.enqueue.kernel_launch(print_message<N,3>, single_thread_config, message<N>("I'm on stream 1"));
-    stream_1.enqueue.memset(buffer.get(), 'b', buffer_size);
+    stream_1.enqueue.memset(buffer, 'b');
     auto callback = [&]() {
         std::cout << "Callback from stream 1!... \n";
-        print_first_char(buffer.get());
+        print_first_char(buffer.data());
     };
     stream_1.enqueue.host_invokable(callback);
     auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
@@ -177,15 +177,15 @@ int main(int argc, char **argv)
     // TODO: The following doesn't have much of a meaningful effect; we should modify this example
     // so that the attachment has some observable effect
     stream_1.enqueue.attach_managed_region(buffer.get());
-    stream_1.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
+    stream_1.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer_size);
     event_1.record(stream_1);
     stream_1.enqueue.kernel_launch(print_message<N,4>, single_thread_config, message<N>("I'm on stream 1"));
     stream_2.enqueue.wait(event_1);
-    stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.get());
+    stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.data());
     stream_2.enqueue.kernel_launch(print_message<N,5>, single_thread_config, message<N>("I'm on stream 2"));
     bool idleness_1 = stream_2.has_work_remaining();
     device.synchronize();
-    print_first_char(buffer.get());
+    print_first_char(buffer.data());
     // cuda::memory::managed::free(buffer);
     bool idleness_2 = stream_2.has_work_remaining();
     std::cout << std::boolalpha
23 changes: 11 additions & 12 deletions examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
@@ -29,12 +29,12 @@ __global__ void increment_kernel(datum*g_data, datum inc_value)
     g_data[global_idx] = g_data[global_idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x)
+bool correct_output(cuda::span<const int> data, const int x)
 {
-    for (int i = 0; i < n; i++)
+    for (size_t i = 0; i < data.size(); i++)
         if (data[i] != x)
         {
-            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+            printf("Error! data[%lu] = %d, ref = %d\n", i, data[i], x);
             return false;
         }
     return true;
@@ -51,15 +51,14 @@ int main(int, char **)

     std::cout << "CUDA device [" << device.name() << "]\n";

-    int n = 16 * 1024 * 1024;
-    int num_bytes = n * sizeof(datum);
+    const int n = 16 * 1024 * 1024;
     int value = 26;

     // allocate host memory
-    auto a = cuda::memory::host::make_unique<datum[]>(n);
-    cuda::memory::host::zero(a.get(), num_bytes);
+    auto a = cuda::memory::host::make_unique_span<datum>(n);
+    cuda::memory::host::zero(a);

-    auto d_a = cuda::memory::make_unique<datum[]>(device, n);
+    auto d_a = cuda::memory::make_unique_span<datum>(device, n);

     auto launch_config = cuda::launch_config_builder()
         .overall_size(n)
@@ -80,9 +79,9 @@ int main(int, char **)
     auto stream = device.default_stream(); // device.create_stream(cuda::stream::async);
     auto cpu_time_start = std::chrono::high_resolution_clock::now();
     stream.enqueue.event(start_event);
-    stream.enqueue.copy(d_a.get(), a.get(), num_bytes);
-    stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.get(), value);
-    stream.enqueue.copy(a.get(), d_a.get(), num_bytes);
+    stream.enqueue.copy(d_a, a);
+    stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value);
+    stream.enqueue.copy(a, d_a);
     stream.enqueue.event(end_event);
     auto cpu_time_end = std::chrono::high_resolution_clock::now();

@@ -99,7 +98,7 @@ int main(int, char **)
     std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2)<< (cpu_time_end - cpu_time_start).count() << '\n';
     std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n";

-    auto bFinalResults = correct_output(a.get(), n, value);
+    auto bFinalResults = correct_output(a, value);

     std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n';

14 changes: 7 additions & 7 deletions examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp
@@ -89,17 +89,17 @@ int main()
         std::unique_ptr<float[]>(new float[nElements])
     );

-    auto device_buffer = cuda::memory::device::make_unique<float[]>(nElements);
+    auto device_buffer = cuda::memory::device::make_unique_span<float>(nElements);

     auto pinned_host_buffers = std::make_pair(
-        cuda::memory::host::make_unique<float[]>(nElements),
-        cuda::memory::host::make_unique<float[]>(nElements)
+        cuda::memory::host::make_unique_span<float>(nElements),
+        cuda::memory::host::make_unique_span<float>(nElements)
     );

     auto h_aPageable = pageable_host_buffers.first.get();
     auto h_bPageable = pageable_host_buffers.second.get();
-    auto h_aPinned = pinned_host_buffers.first.get();
-    auto h_bPinned = pinned_host_buffers.second.get();
+    auto h_aPinned = pinned_host_buffers.first.data();
+    auto h_bPinned = pinned_host_buffers.second.data();

     std::iota(h_aPageable, h_aPageable + nElements, 0.0);
     cuda::memory::copy(h_aPinned, h_aPageable, bytes);
@@ -112,6 +112,6 @@ int main()
     std::cout << "\nTransfer size (MB): " << (bytes / Mi) << "\n";

     // perform copies and report bandwidth
-    profileCopies(h_aPageable, h_bPageable, device_buffer.get(), nElements, "Pageable");
-    profileCopies(h_aPinned, h_bPinned, device_buffer.get(), nElements, "Pinned");
+    profileCopies(h_aPageable, h_bPageable, device_buffer.data(), nElements, "Pageable");
+    profileCopies(h_aPinned, h_bPinned, device_buffer.data(), nElements, "Pinned");
 }
12 changes: 6 additions & 6 deletions examples/modified_cuda_samples/clock_nvrtc/clock.cpp
@@ -154,20 +154,20 @@ int main()
     {
         const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block;

-        auto d_input = cuda::memory::make_unique<float[]>(device, input_size);
-        auto d_output = cuda::memory::make_unique<float[]>(device, num_blocks);
+        auto d_input = cuda::memory::make_unique_span<float>(device, input_size);
+        auto d_output = cuda::memory::make_unique_span<float>(device, num_blocks);
         // Note: We won't actually be checking the output...
-        auto d_timers = cuda::memory::make_unique<clock_t []>(device, num_timers);
-        cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float));
+        auto d_timers = cuda::memory::make_unique_span<clock_t>(device, num_timers);
+        cuda::memory::copy(d_input, input.get());

         auto launch_config = cuda::launch_config_builder()
             .num_blocks(num_blocks)
             .block_size(num_threads_per_block)
             .dynamic_shared_memory_size(dynamic_shared_mem_size)
             .build();
-        cuda::launch(kernel_in_module, launch_config, d_input.get(), d_output.get(), d_timers.get());
+        cuda::launch(kernel_in_module, launch_config, d_input.data(), d_output.data(), d_timers.data());
         device.synchronize();
-        cuda::memory::copy(timers.get(), d_timers.get(), num_timers * sizeof(clock_t));
+        cuda::memory::copy(timers.get(), d_timers);
     } // The allocated device buffers are released here
     long double average_elapsed_clock_ticks_per_block = compute_average_elapsed_clocks(timers.get(), num_blocks);

(Diffs for the remaining 23 changed files not shown.)
