Regards #291, #322: Added a unique_span class - a uniquely-owned span. It should provide the most flexibility in our example programs (and typical user programs) for direct use without explicit casting, re-specifying types and sizes, etc.
eyalroz authored and eyalroz-gehc committed Feb 13, 2024
1 parent 7a41a7e commit fe58fa5
Showing 6 changed files with 412 additions and 20 deletions.
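The gist of the change, ahead of the per-file diffs: a unique_span owns its allocation the way a ::std::unique_ptr does, but also carries its element type and count the way a cuda::span does, so sizes no longer travel separately. A minimal before/after sketch, using only calls that appear in the diffs below (device, n and datum stand in for the examples' own variables):

	// Before: an owning pointer; the element count must be passed around separately
	auto d_old = cuda::memory::make_unique<datum[]>(device, n);
	// After: an owning span; pointer, element type and count stay together
	auto d_new = cuda::memory::make_unique_span<datum>(device, n);
	// ... d_new.data() and d_new.size() are then available wherever d_new goes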
23 changes: 11 additions & 12 deletions examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
@@ -29,12 +29,12 @@ __global__ void increment_kernel(datum *g_data, datum inc_value)
 	g_data[global_idx] = g_data[global_idx] + inc_value;
 }
 
-bool correct_output(int *data, const int n, const int x)
+bool correct_output(cuda::span<const int> data, const int x)
 {
-	for (int i = 0; i < n; i++)
+	for (size_t i = 0; i < data.size(); i++)
 		if (data[i] != x)
 		{
-			printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+			printf("Error! data[%lu] = %d, ref = %d\n", i, data[i], x);
 			return false;
 		}
 	return true;
@@ -51,15 +51,14 @@ int main(int, char **)
 
 	std::cout << "CUDA device [" << device.name() << "]\n";
 
-	int n = 16 * 1024 * 1024;
-	int num_bytes = n * sizeof(datum);
+	const int n = 16 * 1024 * 1024;
 	int value = 26;
 
 	// allocate host memory
-	auto a = cuda::memory::host::make_unique<datum[]>(n);
-	cuda::memory::host::zero(a.get(), num_bytes);
+	auto a = cuda::memory::host::make_unique_span<datum>(n);
+	cuda::memory::host::zero(a);
 
-	auto d_a = cuda::memory::make_unique<datum[]>(device, n);
+	auto d_a = cuda::memory::make_unique_span<datum>(device, n);
 
 	auto launch_config = cuda::launch_config_builder()
 		.overall_size(n)
@@ -80,9 +79,9 @@ int main(int, char **)
 	auto stream = device.default_stream(); // device.create_stream(cuda::stream::async);
 	auto cpu_time_start = std::chrono::high_resolution_clock::now();
 	stream.enqueue.event(start_event);
-	stream.enqueue.copy(d_a.get(), a.get(), num_bytes);
-	stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.get(), value);
-	stream.enqueue.copy(a.get(), d_a.get(), num_bytes);
+	stream.enqueue.copy(d_a, a);
+	stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value);
+	stream.enqueue.copy(a, d_a);
 	stream.enqueue.event(end_event);
 	auto cpu_time_end = std::chrono::high_resolution_clock::now();
 
@@ -99,7 +98,7 @@ int main(int, char **)
 	std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2) << (cpu_time_end - cpu_time_start).count() << '\n';
 	std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n";
 
-	auto bFinalResults = correct_output(a.get(), n, value);
+	auto bFinalResults = correct_output(a, value);
 
 	std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n';
 
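Note the correct_output change above: it now takes a cuda::span<const int> and is called with the unique_span directly, dropping the separate length parameter. A hedged sketch of the same pattern (sum is an illustrative placeholder, not part of the commit):

	// A unique_span converts to a span at the call site, carrying its size along
	int sum(cuda::span<const int> data)
	{
		int total = 0;
		for (size_t i = 0; i < data.size(); i++) { total += data[i]; }
		return total;
	}
	// auto a = cuda::memory::host::make_unique_span<int>(n);
	// int s = sum(a);   // no length argument, no a.get()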
15 changes: 7 additions & 8 deletions examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
@@ -30,7 +30,6 @@ int main()
 	}
 
 	int numElements = 50000;
-	size_t size = numElements * sizeof(float);
 	std::cout << "[Vector addition of " << numElements << " elements]\n";
 
 	// If we could rely on C++14, we would use std::make_unique
@@ -43,12 +42,12 @@
 	std::generate(h_B.get(), h_B.get() + numElements, generator);
 
 	auto device = cuda::device::current::get();
-	auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
-	auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
-	auto d_C = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_A = cuda::memory::make_unique_span<float>(device, numElements);
+	auto d_B = cuda::memory::make_unique_span<float>(device, numElements);
+	auto d_C = cuda::memory::make_unique_span<float>(device, numElements);
 
-	cuda::memory::copy(d_A.get(), h_A.get(), size);
-	cuda::memory::copy(d_B.get(), h_B.get(), size);
+	cuda::memory::copy(d_A, h_A.get());
+	cuda::memory::copy(d_B, h_B.get());
 
 	auto launch_config = cuda::launch_config_builder()
 		.overall_size(numElements)
@@ -61,10 +60,10 @@
 
 	cuda::launch(
 		vectorAdd, launch_config,
-		d_A.get(), d_B.get(), d_C.get(), numElements
+		d_A.data(), d_B.data(), d_C.data(), numElements
 	);
 
-	cuda::memory::copy(h_C.get(), d_C.get(), size);
+	cuda::memory::copy(h_C.get(), d_C);
 
 	// Verify that the result vector is correct
 	for (int i = 0; i < numElements; ++i) {
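The copy calls above also show the mixed form: one side a unique_span, the other a raw pointer, with the byte count taken from the span; kernels still receive raw pointers, hence .data() at the launch site. A hedged recap of those forms (n, my_kernel and launch_config are placeholders, not part of the commit):

	auto device = cuda::device::current::get();
	auto h = std::unique_ptr<float[]>(new float[n]);            // host buffer, as in this example
	auto d = cuda::memory::make_unique_span<float>(device, n);  // device buffer

	cuda::memory::copy(d, h.get());                       // host-to-device; size taken from d
	cuda::launch(my_kernel, launch_config, d.data(), n);  // kernel parameters stay raw pointers
	cuda::memory::copy(h.get(), d);                       // device-to-host; size taken from d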
2 changes: 2 additions & 0 deletions src/cuda/api.hpp
@@ -32,6 +32,7 @@
 #endif
 #include "api/unique_ptr.hpp"
 #include "api/unique_region.hpp"
+#include "api/unique_span.hpp"
 #include "api/link_options.hpp"
 
 #include "api/device.hpp"
@@ -69,6+70,7 @@
 #include "api/multi_wrapper_impls/stream.hpp"
 #include "api/multi_wrapper_impls/memory.hpp"
 #include "api/multi_wrapper_impls/unique_region.hpp"
+#include "api/multi_wrapper_impls/unique_span.hpp"
 #include "api/multi_wrapper_impls/virtual_memory.hpp"
 #include "api/multi_wrapper_impls/kernel.hpp"
 #include "api/multi_wrapper_impls/kernel_launch.hpp"
109 changes: 109 additions & 0 deletions src/cuda/api/multi_wrapper_impls/unique_span.hpp
@@ -0,0 +1,109 @@
/**
* @file
*
* @brief Implementations of @ref `make_unique_span()` functions
*/
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_SPAN_HPP_
#define MULTI_WRAPPER_IMPLS_UNIQUE_SPAN_HPP_

#include "../unique_span.hpp"
#include "../current_device.hpp"
#include "../current_context.hpp"
#include "../primary_context.hpp"
#include "../memory.hpp"
#include "../types.hpp"

namespace cuda {

namespace memory {

namespace device {

/**
 * @brief Allocate memory for a sequence of elements in device-global memory,
 * within a given context, owned by the returned `unique_span`
 */
template <typename T>
unique_span<T> make_unique_span(const context_t& context, size_t num_elements)
{
	return detail_::make_unique_span<T>(context.handle(), num_elements);
}

/**
 * @brief Allocate memory for a sequence of elements in device-global
 * memory, owned by the returned `unique_span`
 *
 * @tparam T the type of the individual elements
 *
 * @param device on which to allocate the elements
 * @param num_elements the number of elements to allocate
 * @return a `unique_span` owning the allocated memory
 */
template <typename T>
unique_span<T> make_unique_span(const device_t& device, size_t num_elements)
{
auto pc = device.primary_context();
CAW_SET_SCOPE_CONTEXT(pc.handle());
return make_unique_span<T>(pc, num_elements);
}

/**
 * @brief Allocate memory for a sequence of elements in device-global
 * memory on the current device, owned by the returned `unique_span`
 *
 * @note The allocation will be made in the device's primary context -
 * which will be created if it has not yet been.
 *
 * @tparam T the type of the individual elements
 *
 * @param num_elements the number of elements to allocate
 *
 * @return a `unique_span` owning the allocated memory
 */
template <typename T>
unique_span<T> make_unique_span(size_t num_elements)
{
auto current_device_id = cuda::device::current::detail_::get_id();
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
return make_unique_span<T>(pc, num_elements);
}

} // namespace device

namespace managed {

template <typename T>
unique_span<T> make_unique_span(
	const context_t& context,
	size_t num_elements,
	initial_visibility_t initial_visibility)
{
	CAW_SET_SCOPE_CONTEXT(context.handle());
	return unique_span<T>{ detail_::allocate_in_current_context(num_elements * sizeof(T), initial_visibility) };
}

template <typename T>
unique_span<T> make_unique_span(
	const device_t& device,
	size_t num_elements,
	initial_visibility_t initial_visibility)
{
	auto pc = device.primary_context();
	return make_unique_span<T>(pc, num_elements, initial_visibility);
}

template <typename T>
unique_span<T> make_unique_span(
	size_t num_elements,
	initial_visibility_t initial_visibility)
{
	auto current_device_id = cuda::device::current::detail_::get_id();
	auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
	return make_unique_span<T>(pc, num_elements, initial_visibility);
}

} // namespace managed

} // namespace memory

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_UNIQUE_SPAN_HPP_
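Taken together, the overloads above give both device-global and managed memory three ways to choose where the allocation lives: an explicit context, a device (via its primary context), or the current device. A hedged usage sketch (n is a placeholder, and the initial_visibility_t enumerator is assumed from the library's managed-memory API rather than shown in this commit):

	auto dev = cuda::device::current::get();
	auto ctx = dev.primary_context();

	// Device-global memory: per-context, per-device, or current-device
	auto s1 = cuda::memory::device::make_unique_span<float>(ctx, n);
	auto s2 = cuda::memory::device::make_unique_span<float>(dev, n);
	auto s3 = cuda::memory::device::make_unique_span<float>(n);

	// Managed ("unified") memory, with an initial-visibility choice
	auto m = cuda::memory::managed::make_unique_span<float>(
		dev, n, cuda::memory::managed::initial_visibility_t::to_all_devices);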

1 change: 1 addition & 0 deletions src/cuda/api/unique_region.hpp
@@ -244,6 +244,7 @@ inline unique_region make_unique_region(
 } // namespace managed
 
 } // namespace memory
+
 } // namespace cuda
 
 #endif // CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_
(The diff of the sixth changed file, src/cuda/api/unique_span.hpp, which defines the unique_span class itself, did not load on this page.)
