-
-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Regards #291: Added support for untyped, owned, memory regions:
* Added a `unique_region` class - non-templated * `unique_region`'s "decay" into `cuda::memory::region_t`'s - so there's no need to reimplement half the world for them * Added a missing variant of `cuda::memory::copy()` * Added an example program based on vectorAdd, which uses unique_regions (I would have replaced vectorAdd with it - had it not requirted to lines of source more than before...) * Removed some commented-out unique-pointer-related code
- Loading branch information
Showing
8 changed files
with
481 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
82 changes: 82 additions & 0 deletions
82
examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
/** | ||
* Derived from the nVIDIA CUDA 8.0 samples by | ||
* | ||
* Eyal Rozenberg | ||
* | ||
* The derivation is specifically permitted in the nVIDIA CUDA Samples EULA | ||
* and the deriver is the owner of this code according to the EULA. | ||
* | ||
* Use this reasonably. If you want to discuss licensing formalities, please | ||
* contact the author. | ||
*/ | ||
|
||
#include <cuda/api.hpp> | ||
|
||
#include <iostream> | ||
#include <memory> | ||
#include <algorithm> | ||
|
||
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) | ||
{ | ||
int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (i < numElements) { C[i] = A[i] + B[i]; } | ||
} | ||
|
||
int main() | ||
{ | ||
if (cuda::device::count() == 0) { | ||
std::cerr << "No CUDA devices on this system" << "\n"; | ||
exit(EXIT_FAILURE); | ||
} | ||
|
||
int numElements = 50000; | ||
std::cout << "[Vector addition of " << numElements << " elements]\n"; | ||
|
||
// If we could rely on C++14, we would use std::make_unique | ||
auto h_A = std::unique_ptr<float[]>(new float[numElements]); | ||
auto h_B = std::unique_ptr<float[]>(new float[numElements]); | ||
auto h_C = std::unique_ptr<float[]>(new float[numElements]); | ||
|
||
auto generator = []() { return rand() / (float) RAND_MAX; }; | ||
std::generate(h_A.get(), h_A.get() + numElements, generator); | ||
std::generate(h_B.get(), h_B.get() + numElements, generator); | ||
|
||
auto device = cuda::device::current::get(); | ||
auto d_A = cuda::memory::make_unique_region(device, numElements); | ||
auto d_B = cuda::memory::make_unique_region(device, numElements); | ||
auto d_C = cuda::memory::make_unique_region(device, numElements); | ||
auto sp_A = d_A.as_span<float>(); | ||
auto sp_B = d_A.as_span<float>(); | ||
auto sp_C = d_A.as_span<float>(); | ||
|
||
cuda::memory::copy(sp_A, h_A.get()); | ||
cuda::memory::copy(sp_B, h_B.get()); | ||
|
||
auto launch_config = cuda::launch_config_builder() | ||
.overall_size(numElements) | ||
.block_size(256) | ||
.build(); | ||
|
||
std::cout | ||
<< "CUDA kernel launch with " << launch_config.dimensions.grid.x | ||
<< " blocks of " << launch_config.dimensions.block.x << " threads each\n"; | ||
|
||
cuda::launch( | ||
vectorAdd, launch_config, | ||
sp_A.data(), sp_B.data(), sp_C.data(), numElements | ||
); | ||
|
||
cuda::memory::copy(h_C.get(), d_C); | ||
|
||
// Verify that the result vector is correct | ||
for (int i = 0; i < numElements; ++i) { | ||
if (fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) { | ||
std::cerr << "Result verification failed at element " << i << "\n"; | ||
exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
std::cout << "Test PASSED\n"; | ||
std::cout << "SUCCESS\n"; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/** | ||
* @file | ||
* | ||
* @brief Implementations of @ref `make_unique_region()` functions | ||
*/ | ||
#pragma once | ||
#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ | ||
#define MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ | ||
|
||
#include "../unique_region.hpp" | ||
#include "../types.hpp" | ||
|
||
namespace cuda { | ||
|
||
namespace memory { | ||
|
||
namespace device { | ||
|
||
inline unique_region make_unique_region(const context_t& context, cuda::size_t num_elements) | ||
{ | ||
return detail_::make_unique_region(context.handle(), num_elements); | ||
} | ||
|
||
/** | ||
* @brief Create a variant of ::std::unique_pointer for an array in | ||
* device-global memory | ||
* | ||
* @tparam T an array type; _not_ the type of individual elements | ||
* | ||
* @param device on which to construct the array of elements | ||
* @param num_elements the number of elements to allocate | ||
* @return an ::std::unique_ptr pointing to the constructed T array | ||
*/ | ||
inline unique_region make_unique_region(const device_t& device, size_t num_elements) | ||
{ | ||
auto pc = device.primary_context(); | ||
CAW_SET_SCOPE_CONTEXT(pc.handle()); | ||
return make_unique_region(pc, num_elements); | ||
} | ||
|
||
/** | ||
* @brief Create a variant of ::std::unique_pointer for an array in | ||
* device-global memory on the current device. | ||
* | ||
* @note The allocation will be made in the device's primary context - | ||
* which will be created if it has not yet been. | ||
* | ||
* @tparam T an array type; _not_ the type of individual elements | ||
* | ||
* @param num_elements the number of elements to allocate | ||
* | ||
* @return an ::std::unique_ptr pointing to the constructed T array | ||
*/ | ||
inline unique_region make_unique_region(size_t num_elements) | ||
{ | ||
auto current_device_id = cuda::device::current::detail_::get_id(); | ||
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); | ||
return make_unique_region(pc, num_elements); | ||
} | ||
|
||
} // namespace device | ||
|
||
namespace managed { | ||
|
||
inline unique_region make_unique_region( | ||
const context_t& context, | ||
size_t num_bytes, | ||
initial_visibility_t initial_visibility) | ||
{ | ||
CAW_SET_SCOPE_CONTEXT(context.handle()); | ||
return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; | ||
} | ||
|
||
inline unique_region make_unique_region( | ||
const device_t& device, | ||
size_t num_bytes, | ||
initial_visibility_t initial_visibility) | ||
{ | ||
auto pc = device.primary_context(); | ||
return make_unique_region(pc, num_bytes, initial_visibility); | ||
} | ||
|
||
inline unique_region make_unique_region( | ||
size_t num_bytes, | ||
initial_visibility_t initial_visibility) | ||
{ | ||
auto current_device_id = cuda::device::current::detail_::get_id(); | ||
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); | ||
return make_unique_region(pc, num_bytes, initial_visibility); | ||
} | ||
|
||
} // namespace managed | ||
|
||
} // namespace memory | ||
|
||
} // namespace cuda | ||
|
||
#endif // MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.