Fixed #291: Added a unique_span class - a uniquely-owned span. It should provide the most flexibility in our example programs (and typical user programs) for direct use without explicit casting, re-specifying types and sizes, etc.
eyalroz committed Apr 19, 2024
1 parent f7796de commit a68a502
Showing 28 changed files with 1,176 additions and 220 deletions.
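
For orientation, a minimal sketch of the pattern the examples are migrated to, assembled from the changed example code below. It is not part of the commit: the body of the increment kernel and the <cuda/api.hpp> include path are assumptions for illustration; the make_unique_span call, launch-config builder chain and kernel-launch arguments are taken from the event_management example.

#include <cuda/api.hpp>  // assumed umbrella header of the cuda-api-wrappers library

// Illustrative kernel; the examples use an "increment" kernel, its body is assumed here
__global__ void increment(char* data, size_t length)
{
    size_t i = threadIdx.x + (size_t) blockIdx.x * blockDim.x;
    if (i < length) { data[i]++; }
}

int main()
{
    auto device = cuda::device::current::get();

    // Before this commit: make_unique<char[]>(device, size, ...) returned a unique_ptr,
    // so the element count had to be carried around separately (e.g. a buffer_size variable).
    // With unique_span, the owned buffer knows its own size and element type, so .data(),
    // .size(), .begin() and .end() are available directly, and the buffer can be passed
    // as-is to copy() and memset(), as the diffs below show.
    auto buffer = cuda::memory::managed::make_unique_span<char>(
        device, 12345678, cuda::memory::managed::initial_visibility_t::to_all_devices);

    auto wrapped_kernel = cuda::kernel::get(device, increment);
    auto launch_config = cuda::launch_config_builder()
        .kernel(&wrapped_kernel)
        .overall_size(buffer.size())
        .use_maximum_linear_block()
        .build();

    auto stream = device.default_stream();
    stream.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer.size());
    stream.synchronize();
}
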
11 changes: 5 additions & 6 deletions examples/by_api_module/event_management.cu
@@ -94,13 +94,12 @@ int main(int argc, char **argv)
     cuda::event::do_record_timings,
     cuda::event::not_interprocess);

-    constexpr size_t buffer_size = 12345678;
-    auto buffer = cuda::memory::managed::make_unique<char[]>(
-        device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices);
+    auto buffer = cuda::memory::managed::make_unique_span<char>(
+        device, 12345678, cuda::memory::managed::initial_visibility_t::to_all_devices);
     auto wrapped_kernel = cuda::kernel::get(device, increment);
     auto launch_config = cuda::launch_config_builder()
         .kernel(&wrapped_kernel)
-        .overall_size(buffer_size)
+        .overall_size(buffer.size())
         .use_maximum_linear_block()
         .build();

@@ -110,7 +109,7 @@ int main(int argc, char **argv)
         report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2);
     };
     stream.enqueue.host_invokable(first_callback);
-    stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
+    stream.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer.size());
     auto second_callback = [&] {
         report_occurrence("In second callback (enqueued after the first kernel but before the second event)",
             event_1, event_2);
@@ -136,7 +135,7 @@ int main(int argc, char **argv)
     report_occurrence("After synchronizing on event_2, but before synchronizing on the stream", event_1, event_2);
     std::cout
         << cuda::event::time_elapsed_between(event_1, event_2).count() << " msec have elapsed, "
-        << "executing the second kernel (\"increment\") on a buffer of " << buffer_size
+        << "executing the second kernel (\"increment\") on a buffer of " << buffer.size()
         << " chars and triggering two callbacks.\n";
     // ... and this should make the third kernel execute
     stream.synchronize();
18 changes: 9 additions & 9 deletions examples/by_api_module/stream_management.cu
@@ -154,21 +154,21 @@ int main(int argc, char **argv)
 #endif

     constexpr auto buffer_size = 12345678;
-    auto buffer = cuda::memory::managed::make_unique<char[]>(
+    auto buffer = cuda::memory::managed::make_unique_span<char>(
         buffer_size,
         device.supports_concurrent_managed_access() ?
             cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access:
             cuda::memory::managed::initial_visibility_t::to_all_devices);
-    print_first_char(buffer.get());
-    std::fill(buffer.get(), buffer.get() + buffer_size, 'a');
-    print_first_char(buffer.get());
+    print_first_char(buffer.data());
+    std::fill(buffer.begin(), buffer.end(), 'a');
+    print_first_char(buffer.data());

     auto event_1 = cuda::event::create(device, cuda::event::sync_by_blocking);
     stream_1.enqueue.kernel_launch(print_message<N,3>, single_thread_config, message<N>("I'm on stream 1"));
-    stream_1.enqueue.memset(buffer.get(), 'b', buffer_size);
+    stream_1.enqueue.memset(buffer, 'b');
     auto callback = [&]() {
         std::cout << "Callback from stream 1!... \n";
-        print_first_char(buffer.get());
+        print_first_char(buffer.data());
     };
     stream_1.enqueue.host_invokable(callback);
     auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
@@ -177,15 +177,15 @@ int main(int argc, char **argv)
     // TODO: The following doesn't have much of a meaningful effect; we should modify this example
     // so that the attachment has some observable effect
     stream_1.enqueue.attach_managed_region(buffer.get());
-    stream_1.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
+    stream_1.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer_size);
     event_1.record(stream_1);
     stream_1.enqueue.kernel_launch(print_message<N,4>, single_thread_config, message<N>("I'm on stream 1"));
     stream_2.enqueue.wait(event_1);
-    stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.get());
+    stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.data());
     stream_2.enqueue.kernel_launch(print_message<N,5>, single_thread_config, message<N>("I'm on stream 2"));
     bool idleness_1 = stream_2.has_work_remaining();
     device.synchronize();
-    print_first_char(buffer.get());
+    print_first_char(buffer.data());
     // cuda::memory::managed::free(buffer);
     bool idleness_2 = stream_2.has_work_remaining();
     std::cout << std::boolalpha
23 changes: 11 additions & 12 deletions examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
@@ -29,12 +29,12 @@ __global__ void increment_kernel(datum*g_data, datum inc_value)
     g_data[global_idx] = g_data[global_idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x)
+bool correct_output(cuda::span<const int> data, const int x)
 {
-    for (int i = 0; i < n; i++)
+    for (size_t i = 0; i < data.size(); i++)
         if (data[i] != x)
         {
-            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+            printf("Error! data[%lu] = %d, ref = %d\n", i, data[i], x);
             return false;
         }
     return true;
@@ -51,15 +51,14 @@ int main(int, char **)

     std::cout << "CUDA device [" << device.name() << "]\n";

-    int n = 16 * 1024 * 1024;
-    int num_bytes = n * sizeof(datum);
+    const int n = 16 * 1024 * 1024;
     int value = 26;

     // allocate host memory
-    auto a = cuda::memory::host::make_unique<datum[]>(n);
-    cuda::memory::host::zero(a.get(), num_bytes);
+    auto a = cuda::memory::host::make_unique_span<datum>(n);
+    cuda::memory::host::zero(a);

-    auto d_a = cuda::memory::make_unique<datum[]>(device, n);
+    auto d_a = cuda::memory::make_unique_span<datum>(device, n);

     auto launch_config = cuda::launch_config_builder()
         .overall_size(n)
@@ -80,9 +79,9 @@ int main(int, char **)
     auto stream = device.default_stream(); // device.create_stream(cuda::stream::async);
     auto cpu_time_start = std::chrono::high_resolution_clock::now();
     stream.enqueue.event(start_event);
-    stream.enqueue.copy(d_a.get(), a.get(), num_bytes);
-    stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.get(), value);
-    stream.enqueue.copy(a.get(), d_a.get(), num_bytes);
+    stream.enqueue.copy(d_a, a);
+    stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value);
+    stream.enqueue.copy(a, d_a);
     stream.enqueue.event(end_event);
     auto cpu_time_end = std::chrono::high_resolution_clock::now();

@@ -99,7 +98,7 @@ int main(int, char **)
     std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2)<< (cpu_time_end - cpu_time_start).count() << '\n';
     std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n";

-    auto bFinalResults = correct_output(a.get(), n, value);
+    auto bFinalResults = correct_output(a, value);

     std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n';

14 changes: 7 additions & 7 deletions examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp
@@ -89,17 +89,17 @@ int main()
         std::unique_ptr<float[]>(new float[nElements])
     );

-    auto device_buffer = cuda::memory::device::make_unique<float[]>(nElements);
+    auto device_buffer = cuda::memory::device::make_unique_span<float>(nElements);

     auto pinned_host_buffers = std::make_pair(
-        cuda::memory::host::make_unique<float[]>(nElements),
-        cuda::memory::host::make_unique<float[]>(nElements)
+        cuda::memory::host::make_unique_span<float>(nElements),
+        cuda::memory::host::make_unique_span<float>(nElements)
     );

     auto h_aPageable = pageable_host_buffers.first.get();
     auto h_bPageable = pageable_host_buffers.second.get();
-    auto h_aPinned = pinned_host_buffers.first.get();
-    auto h_bPinned = pinned_host_buffers.second.get();
+    auto h_aPinned = pinned_host_buffers.first.data();
+    auto h_bPinned = pinned_host_buffers.second.data();

     std::iota(h_aPageable, h_aPageable + nElements, 0.0);
     cuda::memory::copy(h_aPinned, h_aPageable, bytes);
@@ -112,6 +112,6 @@ int main()
     std::cout << "\nTransfer size (MB): " << (bytes / Mi) << "\n";

     // perform copies and report bandwidth
-    profileCopies(h_aPageable, h_bPageable, device_buffer.get(), nElements, "Pageable");
-    profileCopies(h_aPinned, h_bPinned, device_buffer.get(), nElements, "Pinned");
+    profileCopies(h_aPageable, h_bPageable, device_buffer.data(), nElements, "Pageable");
+    profileCopies(h_aPinned, h_bPinned, device_buffer.data(), nElements, "Pinned");
 }
12 changes: 6 additions & 6 deletions examples/modified_cuda_samples/clock_nvrtc/clock.cpp
@@ -154,20 +154,20 @@ int main()
     {
         const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block;

-        auto d_input = cuda::memory::make_unique<float[]>(device, input_size);
-        auto d_output = cuda::memory::make_unique<float[]>(device, num_blocks);
+        auto d_input = cuda::memory::make_unique_span<float>(device, input_size);
+        auto d_output = cuda::memory::make_unique_span<float>(device, num_blocks);
         // Note: We won't actually be checking the output...
-        auto d_timers = cuda::memory::make_unique<clock_t []>(device, num_timers);
-        cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float));
+        auto d_timers = cuda::memory::make_unique_span<clock_t>(device, num_timers);
+        cuda::memory::copy(d_input, input.get());

         auto launch_config = cuda::launch_config_builder()
             .num_blocks(num_blocks)
             .block_size(num_threads_per_block)
             .dynamic_shared_memory_size(dynamic_shared_mem_size)
             .build();
-        cuda::launch(kernel_in_module, launch_config, d_input.get(), d_output.get(), d_timers.get());
+        cuda::launch(kernel_in_module, launch_config, d_input.data(), d_output.data(), d_timers.data());
         device.synchronize();
-        cuda::memory::copy(timers.get(), d_timers.get(), num_timers * sizeof(clock_t));
+        cuda::memory::copy(timers.get(), d_timers);
     } // The allocated device buffers are released here
     long double average_elapsed_clock_ticks_per_block = compute_average_elapsed_clocks(timers.get(), num_blocks);

(Diffs for the remaining 23 changed files not shown.)
