diff --git a/examples/by_api_module/event_management.cu b/examples/by_api_module/event_management.cu index da959c6e..44ac9af5 100644 --- a/examples/by_api_module/event_management.cu +++ b/examples/by_api_module/event_management.cu @@ -94,13 +94,12 @@ int main(int argc, char **argv) cuda::event::do_record_timings, cuda::event::not_interprocess); - constexpr size_t buffer_size = 12345678; - auto buffer = cuda::memory::managed::make_unique( - device, buffer_size, cuda::memory::managed::initial_visibility_t::to_all_devices); + auto buffer = cuda::memory::managed::make_unique_span( + device, 12345678, cuda::memory::managed::initial_visibility_t::to_all_devices); auto wrapped_kernel = cuda::kernel::get(device, increment); auto launch_config = cuda::launch_config_builder() .kernel(&wrapped_kernel) - .overall_size(buffer_size) + .overall_size(buffer.size()) .use_maximum_linear_block() .build(); @@ -110,7 +109,7 @@ int main(int argc, char **argv) report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2); }; stream.enqueue.host_invokable(first_callback); - stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size); + stream.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer.size()); auto second_callback = [&] { report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2); @@ -136,7 +135,7 @@ int main(int argc, char **argv) report_occurrence("After synchronizing on event_2, but before synchronizing on the stream", event_1, event_2); std::cout << cuda::event::time_elapsed_between(event_1, event_2).count() << " msec have elapsed, " - << "executing the second kernel (\"increment\") on a buffer of " << buffer_size + << "executing the second kernel (\"increment\") on a buffer of " << buffer.size() << " chars and triggering two callbacks.\n"; // ... and this should make the third kernel execute stream.synchronize(); diff --git a/examples/by_api_module/stream_management.cu b/examples/by_api_module/stream_management.cu index f03b8120..5583c392 100644 --- a/examples/by_api_module/stream_management.cu +++ b/examples/by_api_module/stream_management.cu @@ -154,21 +154,21 @@ int main(int argc, char **argv) #endif constexpr auto buffer_size = 12345678; - auto buffer = cuda::memory::managed::make_unique( + auto buffer = cuda::memory::managed::make_unique_span( buffer_size, device.supports_concurrent_managed_access() ? cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access: cuda::memory::managed::initial_visibility_t::to_all_devices); - print_first_char(buffer.get()); - std::fill(buffer.get(), buffer.get() + buffer_size, 'a'); - print_first_char(buffer.get()); + print_first_char(buffer.data()); + std::fill(buffer.begin(), buffer.end(), 'a'); + print_first_char(buffer.data()); auto event_1 = cuda::event::create(device, cuda::event::sync_by_blocking); stream_1.enqueue.kernel_launch(print_message, single_thread_config, message("I'm on stream 1")); - stream_1.enqueue.memset(buffer.get(), 'b', buffer_size); + stream_1.enqueue.memset(buffer, 'b'); auto callback = [&]() { std::cout << "Callback from stream 1!... 
\n"; - print_first_char(buffer.get()); + print_first_char(buffer.data()); }; stream_1.enqueue.host_invokable(callback); auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); @@ -177,15 +177,15 @@ int main(int argc, char **argv) // TODO: The following doesn't have much of a meaningful effect; we should modify this example // so that the attachment has some observable effect stream_1.enqueue.attach_managed_region(buffer.get()); - stream_1.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size); + stream_1.enqueue.kernel_launch(increment, launch_config, buffer.data(), buffer_size); event_1.record(stream_1); stream_1.enqueue.kernel_launch(print_message, single_thread_config, message("I'm on stream 1")); stream_2.enqueue.wait(event_1); - stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.get()); + stream_2.enqueue.kernel_launch(print_first_char_kernel, launch_config , buffer.data()); stream_2.enqueue.kernel_launch(print_message, single_thread_config, message("I'm on stream 2")); bool idleness_1 = stream_2.has_work_remaining(); device.synchronize(); - print_first_char(buffer.get()); + print_first_char(buffer.data()); // cuda::memory::managed::free(buffer); bool idleness_2 = stream_2.has_work_remaining(); std::cout << std::boolalpha diff --git a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu index 996f29b2..f578cfdc 100644 --- a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu +++ b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu @@ -29,12 +29,12 @@ __global__ void increment_kernel(datum*g_data, datum inc_value) g_data[global_idx] = g_data[global_idx] + inc_value; } -bool correct_output(int *data, const int n, const int x) +bool correct_output(cuda::span data, const int x) { - for (int i = 0; i < n; i++) + for (size_t i = 0; i < data.size(); i++) if (data[i] != x) { - printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); + printf("Error! 
data[%lu] = %d, ref = %d\n", i, data[i], x); return false; } return true; @@ -51,15 +51,14 @@ int main(int, char **) std::cout << "CUDA device [" << device.name() << "]\n"; - int n = 16 * 1024 * 1024; - int num_bytes = n * sizeof(datum); + const int n = 16 * 1024 * 1024; int value = 26; // allocate host memory - auto a = cuda::memory::host::make_unique(n); - cuda::memory::host::zero(a.get(), num_bytes); + auto a = cuda::memory::host::make_unique_span(n); + cuda::memory::host::zero(a); - auto d_a = cuda::memory::make_unique(device, n); + auto d_a = cuda::memory::make_unique_span(device, n); auto launch_config = cuda::launch_config_builder() .overall_size(n) @@ -80,9 +79,9 @@ int main(int, char **) auto stream = device.default_stream(); // device.create_stream(cuda::stream::async); auto cpu_time_start = std::chrono::high_resolution_clock::now(); stream.enqueue.event(start_event); - stream.enqueue.copy(d_a.get(), a.get(), num_bytes); - stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.get(), value); - stream.enqueue.copy(a.get(), d_a.get(), num_bytes); + stream.enqueue.copy(d_a, a); + stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value); + stream.enqueue.copy(a, d_a); stream.enqueue.event(end_event); auto cpu_time_end = std::chrono::high_resolution_clock::now(); @@ -99,7 +98,7 @@ int main(int, char **) std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2)<< (cpu_time_end - cpu_time_start).count() << '\n'; std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n"; - auto bFinalResults = correct_output(a.get(), n, value); + auto bFinalResults = correct_output(a, value); std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n'; diff --git a/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp b/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp index bba9ac0f..d21c91c5 100644 --- a/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp +++ b/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp @@ -89,17 +89,17 @@ int main() std::unique_ptr(new float[nElements]) ); - auto device_buffer = cuda::memory::device::make_unique(nElements); + auto device_buffer = cuda::memory::device::make_unique_span(nElements); auto pinned_host_buffers = std::make_pair( - cuda::memory::host::make_unique(nElements), - cuda::memory::host::make_unique(nElements) + cuda::memory::host::make_unique_span(nElements), + cuda::memory::host::make_unique_span(nElements) ); auto h_aPageable = pageable_host_buffers.first.get(); auto h_bPageable = pageable_host_buffers.second.get(); - auto h_aPinned = pinned_host_buffers.first.get(); - auto h_bPinned = pinned_host_buffers.second.get(); + auto h_aPinned = pinned_host_buffers.first.data(); + auto h_bPinned = pinned_host_buffers.second.data(); std::iota(h_aPageable, h_aPageable + nElements, 0.0); cuda::memory::copy(h_aPinned, h_aPageable, bytes); @@ -112,6 +112,6 @@ int main() std::cout << "\nTransfer size (MB): " << (bytes / Mi) << "\n"; // perform copies and report bandwidth - profileCopies(h_aPageable, h_bPageable, device_buffer.get(), nElements, "Pageable"); - profileCopies(h_aPinned, h_bPinned, device_buffer.get(), nElements, "Pinned"); + profileCopies(h_aPageable, h_bPageable, device_buffer.data(), nElements, "Pageable"); + profileCopies(h_aPinned, h_bPinned, device_buffer.data(), nElements, "Pinned"); } diff --git a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp index 
633d37dc..5f347b64 100644 --- a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp +++ b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp @@ -154,20 +154,20 @@ int main() { const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block; - auto d_input = cuda::memory::make_unique(device, input_size); - auto d_output = cuda::memory::make_unique(device, num_blocks); + auto d_input = cuda::memory::make_unique_span(device, input_size); + auto d_output = cuda::memory::make_unique_span(device, num_blocks); // Note: We won't actually be checking the output... - auto d_timers = cuda::memory::make_unique(device, num_timers); - cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float)); + auto d_timers = cuda::memory::make_unique_span(device, num_timers); + cuda::memory::copy(d_input, input.get()); auto launch_config = cuda::launch_config_builder() .num_blocks(num_blocks) .block_size(num_threads_per_block) .dynamic_shared_memory_size(dynamic_shared_mem_size) .build(); - cuda::launch(kernel_in_module, launch_config, d_input.get(), d_output.get(), d_timers.get()); + cuda::launch(kernel_in_module, launch_config, d_input.data(), d_output.data(), d_timers.data()); device.synchronize(); - cuda::memory::copy(timers.get(), d_timers.get(), num_timers * sizeof(clock_t)); + cuda::memory::copy(timers.get(), d_timers); } // The allocated device buffers are released here long double average_elapsed_clock_ticks_per_block = compute_average_elapsed_clocks(timers.get(), num_blocks); diff --git a/examples/modified_cuda_samples/graphMemoryNodes/graphMemoryNodes.cu b/examples/modified_cuda_samples/graphMemoryNodes/graphMemoryNodes.cu new file mode 100644 index 00000000..f3d071a2 --- /dev/null +++ b/examples/modified_cuda_samples/graphMemoryNodes/graphMemoryNodes.cu @@ -0,0 +1,560 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2023, Eyal Rozenberg + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// System includes +#include +#include + +#include +#include + +// CUDA runtime +#include + +// helper functions and utilities to work with CUDA +#include +#include + +#define THREADS_PER_BLOCK 512 +#define ALLOWABLE_VARIANCE 1.e-6f +#define NUM_ELEMENTS 8000000 + +// Stores the square of each input element in output array +__global__ void squareArray(const float *input, float *output, + int numElements) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numElements) { + output[idx] = input[idx] * input[idx]; + } +} + +// Stores the negative of each input element in output array +__global__ void negateArray(const float *input, float *output, + int numElements) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numElements) { + output[idx] = input[idx] * -1; + } +} + +struct negSquareArrays { + float *input; + float *square; + float *negSquare; + int numElements; + size_t bytes; + size_t numBlocks; +}; + +void fillRandomly(float *array, int numElements) +{ + for (int n = 0; n < numElements; n++) { + array[n] = rand() / (float) RAND_MAX; + } +} + +void resetOutputArrays(negSquareArrays *hostArrays) +{ + fillRandomly(hostArrays->square, hostArrays->numElements); + fillRandomly(hostArrays->negSquare, hostArrays->numElements); +} + +void prepareHostArrays(negSquareArrays *hostArrays) +{ + hostArrays->numElements = NUM_ELEMENTS; + size_t bytes = hostArrays->numElements * sizeof(float); + + size_t numBlocks = hostArrays->numElements / (size_t) THREADS_PER_BLOCK; + if ((numBlocks % (size_t) THREADS_PER_BLOCK) != 0) { + numBlocks++; + } + + hostArrays->input = (float *) malloc(bytes); + hostArrays->square = (float *) malloc(bytes); + hostArrays->negSquare = (float *) malloc(bytes); + hostArrays->bytes = bytes; + hostArrays->numBlocks = numBlocks; + + fillRandomly(hostArrays->input, hostArrays->numElements); + fillRandomly(hostArrays->square, hostArrays->numElements); + fillRandomly(hostArrays->negSquare, hostArrays->numElements); +} + +cuda::graph::instance_t createFreeGraph(float *dPtr) +{ + cudaGraphNode_t freeNode; + + auto graph = cuda::graph::create(); + auto node = graph.insert.node(dPtr); + return graph.instantiate(); +} + +/** + * Demonstrates explicitly creating a CUDA graph including memory nodes. + * createNegateSquaresGraphWithStreamCapture constructs an equivalent graph + * using stream capture. + * + * If d_negSquare_out is non null, then: + * 1) d_negSquare will not be freed; + * 2) the value of d_negSquare_out will be set to d_negSquare. 
+ * + * Diagram of the graph constructed by createNegateSquaresGraphExplicitly: + * + * alloc d_input + * | + * alloc d_square + * | + * Memcpy a to device + * | + * launch kernel squareArray ------->---- Memcpy d_square to host + * | | + * free d_input | + * | | + * allocate d_negSquare | + * | | + * launch kernel negateArray -------->--- free d_square + * | + * Memcpy d_negSquare to host + * | + * free d_negSquare + */ +std::pair +createNegateSquaresGraphExplicitly(int device, negSquareArrays *hostArrays, bool do_neg_squares) +{ + // Array buffers on device + float *d_input, *d_square, *d_negSquare; + + // Memory allocation parameters + cudaMemAllocNodeParams allocParams; + memset(&allocParams, 0, sizeof(allocParams)); + allocParams.bytesize = hostArrays->bytes; + allocParams.poolProps.allocType = cudaMemAllocationTypePinned; + allocParams.poolProps.location.id = device; + allocParams.poolProps.location.type = cudaMemLocationTypeDevice; + + // Kernel launch parameters + cudaKernelNodeParams kernelNodeParams = {0}; + kernelNodeParams.gridDim = dim3(hostArrays->numBlocks, 1, 1); + kernelNodeParams.blockDim = dim3(THREADS_PER_BLOCK, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.extra = NULL; + + cudaGraph_t graph; + cudaGraphNode_t allocNodeInput, allocNodeSquare, allocNodeNegSquare; + cudaGraphNode_t copyNodeInput, copyNodeSquare, copyNodeNegSquare; + cudaGraphNode_t squareKernelNode, negateKernelNode; + cudaGraphNode_t freeNodeInput, freeNodeSquare; + + // Buffer for storing graph node dependencies + std::vector nodeDependencies; + + checkCudaErrors(cudaGraphCreate(&graph, 0)); + + checkCudaErrors( + cudaGraphAddMemAllocNode(&allocNodeInput, graph, NULL, 0, &allocParams)); + d_input = (float *) allocParams.dptr; + + // To keep the graph structure simple (fewer branching dependencies), + // allocNodeSquare should depend on allocNodeInput + checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeSquare, graph, + &allocNodeInput, 1, &allocParams)); + d_square = (float *) allocParams.dptr; + + // copyNodeInput needs to depend on allocNodeInput because copyNodeInput + // writes to d_input. It does so here indirectly through allocNodeSquare. + checkCudaErrors(cudaGraphAddMemcpyNode1D( + ©NodeInput, graph, &allocNodeSquare, 1, d_input, hostArrays->input, + hostArrays->bytes, cudaMemcpyHostToDevice)); + + void *squareKernelArgs[3] = {(void *) &d_input, (void *) &d_square, + (void *) &(hostArrays->numElements)}; + kernelNodeParams.func = (void *) squareArray; + kernelNodeParams.kernelParams = (void **) squareKernelArgs; + + // Square kernel depends on copyNodeInput to ensure all data is on the device + // before kernel launch. + checkCudaErrors(cudaGraphAddKernelNode(&squareKernelNode, graph, + ©NodeInput, 1, &kernelNodeParams)); + + checkCudaErrors(cudaGraphAddMemcpyNode1D( + ©NodeSquare, graph, &squareKernelNode, 1, hostArrays->square, + d_square, hostArrays->bytes, cudaMemcpyDeviceToHost)); + + // Free of d_input depends on the square kernel to ensure that d_input is not + // freed while being read by the kernel. It also depends on the alloc of + // d_input via squareKernelNode > copyNodeInput > allocNodeSquare > + // allocNodeInput. + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeInput, graph, + &squareKernelNode, 1, d_input)); + + // Allocation of C depends on free of A so CUDA can reuse the virtual address. 
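+ // (In this example, "C" refers to d_negSquare and "A" to d_input; the pointer comparison just below checks whether the freed virtual address was in fact reused.)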
+ checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeNegSquare, graph, + &freeNodeInput, 1, &allocParams)); + d_negSquare = (float *) allocParams.dptr; + + if (d_negSquare == d_input) { + printf( + "Check verified that d_negSquare and d_input share a virtual " + "address.\n"); + } + + void *negateKernelArgs[3] = {(void *) &d_square, (void *) &d_negSquare, + (void *) &(hostArrays->numElements)}; + kernelNodeParams.func = (void *) negateArray; + kernelNodeParams.kernelParams = (void **) negateKernelArgs; + + checkCudaErrors(cudaGraphAddKernelNode( + &negateKernelNode, graph, &allocNodeNegSquare, 1, &kernelNodeParams)); + + nodeDependencies.push_back(copyNodeSquare); + nodeDependencies.push_back(negateKernelNode); + checkCudaErrors(cudaGraphAddMemFreeNode(&freeNodeSquare, graph, + nodeDependencies.data(), + nodeDependencies.size(), d_square)); + nodeDependencies.clear(); + + checkCudaErrors(cudaGraphAddMemcpyNode1D( + ©NodeNegSquare, graph, &negateKernelNode, 1, hostArrays->negSquare, + d_negSquare, hostArrays->bytes, cudaMemcpyDeviceToHost)); + + if (d_negSquare_out == NULL) { + cudaGraphNode_t freeNodeNegSquare; + checkCudaErrors(cudaGraphAddMemFreeNode( + &freeNodeNegSquare, graph, ©NodeNegSquare, 1, d_negSquare)); + } + else { + *d_negSquare_out = d_negSquare; + } + + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaGraphDestroy(graph)); +} + +cuda::graph::instance_t +createNegateSquaresGraphExplicitly(int device, negSquareArrays *hostArrays) +{ + static constexpr const auto do_neg_squares { true }; + return createNegateSquaresGraphExplicitly(device, hostArrays, do_neg_squares).first; +} + +/** + * Adds work to a CUDA stream which negates the square of values in the input + * array. + * + * If d_negSquare_out is non null, then: + * 1) d_negSquare will not be freed; + * 2) the value of d_negSquare_out will be set to d_negSquare. + * + * Diagram of the stream operations in doNegateSquaresInStream + * --------------------------------------------------------------------- + * | STREAM | STREAM2 | + * --------------------------------------------------------------------- + * + * alloc d_input + * | + * alloc d_square + * | + * Memcpy a to device + * | + * launch kernel squareArray + * | + * record squareKernelCompleteEvent -->-- wait squareKernelCompleteEvent + * | | + * free d_input | + * | | + * allocate d_negSquare Memcpy d_square to host + * | | + * launch kernel negateArray | + * | | + * record negateKernelCompleteEvent -->-- wait negateKernelCompleteEvent + * | | + * Memcpy d_negSquare to host | + * | free d_square + * free d_negSquare | + * | | + * wait squareFreeEvent --------------<---- record squareFreeEvent + */ +void doNegateSquaresInStream(cudaStream_t stream1, negSquareArrays *hostArrays, + float **d_negSquare_out = NULL) +{ + float *d_input, *d_square, *d_negSquare; + cudaStream_t stream2; + cudaEvent_t squareKernelCompleteEvent, negateKernelCompleteEvent, + squareFreeEvent; + + checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking)); + + checkCudaErrors(cudaEventCreate(&squareKernelCompleteEvent)); + checkCudaErrors(cudaEventCreate(&negateKernelCompleteEvent)); + checkCudaErrors(cudaEventCreate(&squareFreeEvent)); + + // Virtual addresses are assigned synchronously when cudaMallocAsync is + // called, thus there is no performace benefit gained by separating the + // allocations into two streams. 
+ checkCudaErrors(cudaMallocAsync(&d_input, hostArrays->bytes, stream1)); + checkCudaErrors(cudaMallocAsync(&d_square, hostArrays->bytes, stream1)); + + checkCudaErrors(cudaMemcpyAsync(d_input, hostArrays->input, hostArrays->bytes, + cudaMemcpyHostToDevice, stream1)); + squareArray<<numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( + d_input, d_square, hostArrays->numElements); + checkCudaErrors(cudaEventRecord(squareKernelCompleteEvent, stream1)); + + checkCudaErrors(cudaStreamWaitEvent(stream2, squareKernelCompleteEvent, 0)); + checkCudaErrors(cudaMemcpyAsync(hostArrays->square, d_square, + hostArrays->bytes, cudaMemcpyDeviceToHost, + stream2)); + + checkCudaErrors(cudaFreeAsync(d_input, stream1)); + checkCudaErrors(cudaMallocAsync(&d_negSquare, hostArrays->bytes, stream1)); + negateArray<<numBlocks, THREADS_PER_BLOCK, 0, stream1>>>( + d_square, d_negSquare, hostArrays->numElements); + checkCudaErrors(cudaEventRecord(negateKernelCompleteEvent, stream1)); + checkCudaErrors(cudaMemcpyAsync(hostArrays->negSquare, d_negSquare, + hostArrays->bytes, cudaMemcpyDeviceToHost, + stream1)); + if (d_negSquare_out == NULL) { + checkCudaErrors(cudaFreeAsync(d_negSquare, stream1)); + } + else { + *d_negSquare_out = d_negSquare; + } + + checkCudaErrors(cudaStreamWaitEvent(stream2, negateKernelCompleteEvent, 0)); + checkCudaErrors(cudaFreeAsync(d_square, stream2)); + checkCudaErrors(cudaEventRecord(squareFreeEvent, stream2)); + + checkCudaErrors(cudaStreamWaitEvent(stream1, squareFreeEvent, 0)); + + checkCudaErrors(cudaStreamDestroy(stream2)); + checkCudaErrors(cudaEventDestroy(squareKernelCompleteEvent)); + checkCudaErrors(cudaEventDestroy(negateKernelCompleteEvent)); + checkCudaErrors(cudaEventDestroy(squareFreeEvent)); +} + +/** + * Demonstrates creating a CUDA graph including memory nodes using stream + * capture. createNegateSquaresGraphExplicitly constructs an equivalent graph + * without stream capture. 
+ */ +cuda::graph::instance_t createNegateSquaresGraphWithStreamCapture(negSquareArrays *hostArrays, + float **d_negSquare_out = NULL) +{ + cudaGraph_t graph; + cudaStream_t stream; + + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + doNegateSquaresInStream(stream, hostArrays, d_negSquare_out); + checkCudaErrors(cudaStreamEndCapture(stream, &graph)); + + checkCudaErrors(cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0)); + checkCudaErrors(cudaStreamDestroy(stream)); + checkCudaErrors(cudaGraphDestroy(graph)); +} + +void prepareRefArrays(negSquareArrays *hostArrays, + negSquareArrays *deviceRefArrays, + bool **foundValidationFailure) +{ + deviceRefArrays->bytes = hostArrays->bytes; + deviceRefArrays->numElements = hostArrays->numElements; + + for (int i = 0; i < hostArrays->numElements; i++) { + hostArrays->square[i] = hostArrays->input[i] * hostArrays->input[i]; + hostArrays->negSquare[i] = hostArrays->square[i] * -1; + } + + checkCudaErrors( + cudaMalloc((void **) &deviceRefArrays->negSquare, deviceRefArrays->bytes)); + checkCudaErrors(cudaMemcpy(deviceRefArrays->negSquare, hostArrays->negSquare, + hostArrays->bytes, cudaMemcpyHostToDevice)); + + checkCudaErrors( + cudaMallocManaged((void **) foundValidationFailure, sizeof(bool))); +} + +int checkValidationFailure(bool *foundValidationFailure) +{ + if (*foundValidationFailure) { + std::cout << "Validation FAILURE!\n\n"; + *foundValidationFailure = false; + return EXIT_FAILURE; + } + else { + std::cout << "Validation PASSED!\n\n"; + return EXIT_SUCCESS; + } +} + +__global__ void validateGPU(float *d_negSquare, negSquareArrays devRefArrays, + bool *foundValidationFailure) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float ref, diff; + + if (idx < devRefArrays.numElements) { + ref = devRefArrays.negSquare[idx]; + diff = d_negSquare[idx] - ref; + diff *= diff; + ref *= ref; + if (diff / ref > ALLOWABLE_VARIANCE) { + *foundValidationFailure = true; + } + } +} + +void validateHost(negSquareArrays *hostArrays, bool *foundValidationFailure) +{ + float ref, diff; + + for (int i = 0; i < hostArrays->numElements; i++) { + ref = hostArrays->input[i] * hostArrays->input[i] * -1; + diff = hostArrays->negSquare[i] - ref; + diff *= diff; + ref *= ref; + if (diff / ref > ALLOWABLE_VARIANCE) { + *foundValidationFailure = true; + } + } +} + +int main(int argc, char **argv) +{ + negSquareArrays hostArrays, deviceRefArrays; + + auto launch_config = cuda::launch_config_builder() + .block_dimensions(THREADS_PER_BLOCK) + .grid_dimensions(hostArrays.numBlocks) + .no_dynamic_shared_memory() + .build(); + + // Declare pointers for GPU buffers + float *d_negSquare = NULL; + bool *foundValidationFailure = NULL; + + srand(time(0)); + + // Being very cavalier about our command-line arguments here... + cuda::device::id_t device_id = (argc > 1) ? 
std::stoi(argv[1]) : cuda::device::default_device_id; + auto device = cuda::device::get(device_id); + + if (cuda::version_numbers::driver() < cuda::version_numbers::make(11040)) { + std::cout << "Waiving execution as driver does not support Graph Memory Nodes\n"; + exit(EXIT_SUCCESS); + } + + if (not device.supports_memory_pools()) { + std::cout << "Waiving execution as device does not support Memory Pools\n"; + exit(EXIT_SUCCESS); + } + std::cout << "Setting up sample.\n"; + + prepareHostArrays(&hostArrays); + prepareRefArrays(&hostArrays, &deviceRefArrays, &foundValidationFailure); + auto stream = device.create_stream(cuda::stream::async); + std::cout << "Setup complete.\n\n"; + + std::cout << "Running negateSquares in a stream.\n"; + doNegateSquaresInStream(stream.handle(), &hostArrays); + std::cout << "Validating negateSquares in a stream...\n"; + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + + { + std::cout << "Running negateSquares in a stream-captured graph.\n"; + auto executable_graph_instance = createNegateSquaresGraphWithStreamCapture(&hostArrays); + cuda::graph::launch(executable_graph_instance, stream); + stream.synchronize(); + std::cout << "Validating negateSquares in a stream-captured graph...\n"; + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + } + + { + std::cout << "Running negateSquares in an explicitly constructed graph.\n"; + auto executable_graph_instance = createNegateSquaresGraphExplicitly(device.id(), &hostArrays); + cuda::graph::launch(executable_graph_instance, stream); + stream.synchronize(); + std::cout << "Validating negateSquares in an explicitly constructed graph...\n"; + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + } + // Each of the three examples below free d_negSquare outside the graph. As + // demonstrated by validateGPU, d_negSquare can be accessed by outside the + // graph before d_negSquare is freed. + + { + std::cout << "Running negateSquares with d_negSquare freed outside the stream.\n"; + static constexpr const auto compute_neg_squares { true }; + auto pair = createNegateSquaresGraphExplicitly(device.id(), &hostArrays, compute_neg_squares); + auto executable_graph_instance = std::move(pair.first); + auto d_negSquare = std::move(pair.second); + auto free_graph_instance = createFreeGraph(d_negSquare); + cuda::graph::launch(executable_graph_instance, stream); + stream.enqueue.kernel_launch(validateGPU, launch_config, d_negSquare, deviceRefArrays, foundValidationFailure); + stream.synchronize(); + printf( + "Validating negateSquares with d_negSquare freed outside the " + "stream...\n"); + validateHost(&hostArrays, foundValidationFailure); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + + std::cout << "Running negateSquares with d_negSquare freed outside the graph.\n"; + cuda::graph::launch(executable_graph_instance, stream); + stream.enqueue.kernel_launch(validateGPU, launch_config, d_negSquare, deviceRefArrays, foundValidationFailure); + stream.synchronize(); + printf( + "Validating negateSquares with d_negSquare freed outside the graph...\n"); + checkValidationFailure(foundValidationFailure); + resetOutputArrays(&hostArrays); + // TODO: What about the instance vs the FreeC? 
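+ // In this last variant, d_negSquare is freed by launching free_graph_instance (the single-node graph built by createFreeGraph) on the stream, after the validation kernel has been enqueued.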
+ printf( + "Running negateSquares with d_negSquare freed in a different graph.\n"); + cuda::graph::launch(executable_graph_instance, stream); + stream.enqueue.kernel_launch(validateGPU, launch_config, d_negSquare, deviceRefArrays, foundValidationFailure); + cuda::graph::launch(free_graph_instance, stream); + stream.synchronize(); + printf( + "Validating negateSquares with d_negSquare freed in a different " + "graph...\n"); + checkValidationFailure(foundValidationFailure); + + } + + std::cout << "\nSUCCESS\n"; +} \ No newline at end of file diff --git a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu index fb861605..32baa02b 100644 --- a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu +++ b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu @@ -43,26 +43,26 @@ int main(int, char **) cuda::device::current::set_to_default(); auto device = cuda::device::current::get(); - auto d_ptr = cuda::memory::make_unique(device, N); - auto h_ptr = cuda::memory::host::make_unique(N); + auto d_span = cuda::memory::make_unique_span(device, N); + auto h_span = cuda::memory::host::make_unique_span(N); std::cout << "Generating data on CPU\n"; - sequence_cpu(h_ptr.get(), N); + sequence_cpu(h_span.data(), h_span.size()); auto launch_config = cuda::launch_config_builder() .overall_size(N) .block_size(256) .build(); - device.launch(sequence_gpu, launch_config, d_ptr.get(), N); + device.launch(sequence_gpu, launch_config, d_span.data(), d_span.size()); cuda::outstanding_error::ensure_none(); device.synchronize(); - auto h_d_ptr = cuda::memory::host::make_unique(N); - cuda::memory::copy(h_d_ptr.get(), d_ptr.get(), N * sizeof(int)); + auto h_d_span = cuda::memory::host::make_unique_span(N); + cuda::memory::copy(h_d_span, d_span); - auto results_are_correct = std::equal(h_ptr.get(), h_ptr.get() + N, h_d_ptr.get()); + auto results_are_correct = std::equal(h_span.begin(), h_span.end(), h_d_span.begin()); if (not results_are_correct) { die_("Results check failed."); } diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index 7592441d..d440b1d7 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -124,9 +124,8 @@ void checkP2Paccess() } void enqueue_p2p_copy( - int *dest, - int *src, - std::size_t num_elems, + cuda::memory::region_t dest, + cuda::memory::region_t src, int repeat, bool p2paccess, P2PEngine p2p_mechanism, @@ -145,7 +144,7 @@ void enqueue_p2p_copy( auto launch_config = cuda::launch_configuration_t{grid_and_block_dims}; for (int r = 0; r < repeat; r++) { - stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest, (int4*)src, num_elems/sizeof(int4)); + stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest.data(), (int4*)src.data(), src.size()/sizeof(int4)); } } else @@ -155,7 +154,7 @@ void enqueue_p2p_copy( // Since we assume Compute Capability >= 2.0, all devices support the // Unified Virtual Address Space, so we don't need to use // cudaMemcpyPeerAsync - cudaMemcpyAsync is enough. 
- cuda::memory::async::copy(dest, src, sizeof(*dest)*num_elems, stream); + cuda::memory::async::copy(dest, src, stream); } } } @@ -165,8 +164,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p int numElems = 10000000; int repeat = 5; vector streams; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -176,8 +175,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p for (auto device : cuda::devices()) { streams.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique_span(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique_span(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -212,17 +211,17 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p if (i == j) { // Perform intra-GPU, D2D copies - enqueue_p2p_copy(buffers[i].get(), buffersD2D[i].get(), numElems, repeat, p2p_access_possible, mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffersD2D[i], repeat, p2p_access_possible, mechanism, streams[i]); } else { if (p2p_method == P2P_WRITE) { - enqueue_p2p_copy(buffers[j].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, mechanism, streams[i]); + enqueue_p2p_copy(buffers[j], buffers[i], repeat, p2p_access_possible, mechanism, streams[i]); } else { - enqueue_p2p_copy(buffers[i].get(), buffers[j].get(), numElems, repeat, p2p_access_possible, mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffers[j], repeat, p2p_access_possible, mechanism, streams[i]); } } @@ -295,8 +294,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) vector streams_0; vector streams_1; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -308,8 +307,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) for (auto device : cuda::devices()) { streams_0.push_back(device.create_stream(cuda::stream::async)); streams_1.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique_span(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique_span(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -350,12 +349,12 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) if (i == j) { // For intra-GPU perform 2 memcopies buffersD2D <-> buffers - enqueue_p2p_copy(buffers[i].get(), buffersD2D[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); - enqueue_p2p_copy(buffersD2D[i].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams_1[i]); + enqueue_p2p_copy(buffers[i], buffersD2D[i], repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); + enqueue_p2p_copy(buffersD2D[i], buffers[i], repeat, p2p_access_possible, p2p_mechanism, streams_1[i]); } else { - enqueue_p2p_copy(buffers[i].get(), buffers[j].get(), 
numElems, repeat, p2p_access_possible, p2p_mechanism, streams_1[j]); - enqueue_p2p_copy(buffers[j].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); + enqueue_p2p_copy(buffers[i], buffers[j], repeat, p2p_access_possible, p2p_mechanism, streams_1[j]); + enqueue_p2p_copy(buffers[j], buffers[i], repeat, p2p_access_possible, p2p_mechanism, streams_0[i]); } // Notify stream0 that stream1 is complete and record the time of @@ -406,8 +405,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer // vector streams; - vector> buffers; - vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy + vector> buffers; + vector> buffersD2D; // buffer for D2D, that is, intra-GPU copy vector start; vector stop; @@ -417,8 +416,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer for(auto device : cuda::devices()) { streams.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique_span(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique_span(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -455,16 +454,16 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer auto time_before_copy = std::chrono::high_resolution_clock::now(); if (i == j) { // Perform intra-GPU, D2D copies - enqueue_p2p_copy(buffers[i].get(), buffersD2D[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffersD2D[i], repeat, p2p_access_possible, p2p_mechanism, streams[i]); } else { if (p2p_method == P2P_WRITE) { - enqueue_p2p_copy(buffers[j].get(), buffers[i].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams[i]); + enqueue_p2p_copy(buffers[j], buffers[i], repeat, p2p_access_possible, p2p_mechanism, streams[i]); } else { - enqueue_p2p_copy(buffers[i].get(), buffers[j].get(), numElems, repeat, p2p_access_possible, p2p_mechanism, streams[i]); + enqueue_p2p_copy(buffers[i], buffers[j], repeat, p2p_access_possible, p2p_mechanism, streams[i]); } } auto time_after_copy = std::chrono::high_resolution_clock::now(); diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp index ae7924c6..352d037f 100644 --- a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -147,13 +147,13 @@ int main(int argc, char** argv) std::generate_n(h_B.get(), N, generator); // Allocate vectors in device memory - auto d_A = cuda::memory::make_unique(device, N); - auto d_B = cuda::memory::make_unique(device, N); - auto d_C = cuda::memory::make_unique(device, N); + auto d_A = cuda::memory::make_unique_span(device, N); + auto d_B = cuda::memory::make_unique_span(device, N); + auto d_C = cuda::memory::make_unique_span(device, N); - cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream); - cuda::memory::async::copy(d_B.get(), h_B.get(), size, stream); + cuda::memory::async::copy(d_A, h_A.get(), size, stream); + cuda::memory::async::copy(d_B, h_B.get(), size, stream); auto launch_config = cuda::launch_config_builder() .overall_size(N) @@ -162,9 +162,9 @@ int main(int argc, char** argv) 
cuda::outstanding_error::ensure_none(); - stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.get(), d_B.get(), d_C.get(), N); + stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N); - cuda::memory::async::copy(h_C.get(), d_C.get(), size, stream); + cuda::memory::async::copy(h_C.get(), d_C, size, stream); stream.synchronize(); for (int i = 0; i < N; ++i) { diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index e68c5568..88230be3 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -67,11 +67,12 @@ __global__ void init_array(int *g_data, const int *factor, int num_iterations) } } -bool check_resulting_data(const int *a, const int n, const int c) +template +bool check_resulting_data(Container const & container, const int c) { - for (int i = 0; i < n; i++) { - if (a[i] != c) { - std::cerr << i << ": " << a[i] << " " << c << "\n"; + for (size_t i = 0; i < container.size(); i++) { + if (container[i] != c) { + std::cerr << i << ": " << container[i] << " " << c << "\n"; return false; } } @@ -111,13 +112,13 @@ void run_simple_streams_example( int c = 5; // value to which the array will be initialized // Allocate Host memory - auto h_a = cuda::memory::host::make_unique(params.n); + auto h_a = cuda::memory::host::make_unique_span(params.n); // allocate device memory // pointers to data and init value in the device memory - auto d_a = cuda::memory::make_unique(device, params.n); - auto d_c = cuda::memory::make_unique(device); - cuda::memory::copy_single(d_c.get(), &c); + auto d_a = cuda::memory::make_unique_span(device, params.n); + auto d_c = cuda::memory::make_unique_span(device, 1); + cuda::memory::copy_single(d_c.data(), &c); std::cout << "\nStarting Test\n"; @@ -142,7 +143,7 @@ void run_simple_streams_example( // time memcpy from device start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed - cuda::memory::async::copy(h_a.get(), d_a.get(), nbytes, streams[0]); + cuda::memory::async::copy(h_a.get(), d_a, streams[0]); stop_event.record(); stop_event.synchronize(); // block until the event is actually recorded auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event); @@ -154,7 +155,7 @@ void run_simple_streams_example( .block_size(512) .build(); start_event.record(); - streams[0].enqueue.kernel_launch(init_array, launch_config, d_a.get(), d_c.get(), params.num_iterations); + streams[0].enqueue.kernel_launch(init_array, launch_config, d_a.data(), d_c.data(), params.num_iterations); stop_event.record(); stop_event.synchronize(); auto time_kernel = cuda::event::time_elapsed_between(start_event, stop_event); @@ -170,8 +171,8 @@ void run_simple_streams_example( for (int k = 0; k < nreps; k++) { - device.launch(init_array, launch_config, d_a.get(), d_c.get(), params.num_iterations); - cuda::memory::copy(h_a.get(), d_a.get(), nbytes); + device.launch(init_array, launch_config, d_a.data(), d_c.data(), params.num_iterations); + cuda::memory::copy(h_a.get(), d_a); } stop_event.record(); @@ -186,11 +187,11 @@ void run_simple_streams_example( .block_size(512) .build(); // TODO: Avoid need to push and pop here - memset(h_a.get(), 255, nbytes); // set host memory bits to all 1s, for testing correctness + std::fill(h_a.begin(), h_a.end(), 255); // set host memory bits to all 1s, for testing correctness // 
This instruction is actually the only one in our program // for which the device.make_current() command was necessary. // TODO: Avoid having to do that altogether... - cuda::memory::device::zero(cuda::memory::region_t{d_a.get(), nbytes}); // set device memory to all 0s, for testing correctness + cuda::memory::device::zero(d_a); // set device memory to all 0s, for testing correctness start_event.record(); for (int k = 0; k < nreps; k++) @@ -199,7 +200,7 @@ void run_simple_streams_example( for (int i = 0; i < nstreams; i++) { streams[i].enqueue.kernel_launch( - init_array, launch_config, d_a.get() + i * params.n / nstreams, d_c.get(), params.num_iterations); + init_array, launch_config, d_a.data() + i * params.n / nstreams, d_c.data(), params.num_iterations); } // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only @@ -207,8 +208,8 @@ void run_simple_streams_example( for (int i = 0; i < nstreams; i++) { cuda::memory::async::copy( - h_a.get() + i * params.n / nstreams, - d_a.get() + i * params.n / nstreams, nbytes / nstreams, + h_a.data() + i * params.n / nstreams, + d_a.data() + i * params.n / nstreams, nbytes / nstreams, streams[i]); } } @@ -220,7 +221,7 @@ void run_simple_streams_example( // check whether the output is correct std::cout << "-------------------------------\n"; - if (not check_resulting_data(h_a.get(), params.n, c * nreps * params.num_iterations)) { + if (not check_resulting_data(h_a, c * nreps * params.num_iterations)) { die_("Result check FAILED."); } } diff --git a/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu b/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu index 9cfac063..1a6be6e2 100644 --- a/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu +++ b/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu @@ -155,9 +155,10 @@ int streamOrderedAllocationPostSync( // Record the start event auto start_event = stream.enqueue.event(); for (int i = 0; i < MAX_ITER; i++) { - auto d_a = span(stream.enqueue.allocate(a.size() * sizeof(float))); - auto d_b = span(stream.enqueue.allocate(b.size() * sizeof(float))); - auto d_c = span(stream.enqueue.allocate(c.size() * sizeof(float))); + // Not: Not using unique_span's, + auto d_a = cuda::span(stream.enqueue.allocate(a.size() * sizeof(float))); + auto d_b = cuda::span(stream.enqueue.allocate(b.size() * sizeof(float))); + auto d_c = cuda::span(stream.enqueue.allocate(c.size() * sizeof(float))); stream.enqueue.copy(d_a, a); stream.enqueue.copy(d_b, b); stream.enqueue.kernel_launch(vectorAddGPU, launch_config, d_a.data(), d_b.data(), d_c.data(), c.size()); diff --git a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu index bf8e24f1..77a8857a 100644 --- a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu +++ b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu @@ -30,10 +30,8 @@ int main() } int numElements = 50000; - size_t size = numElements * sizeof(float); std::cout << "[Vector addition of " << numElements << " elements]\n"; - // If we could rely on C++14, we would use std::make_unique auto h_A = std::unique_ptr(new float[numElements]); auto h_B = std::unique_ptr(new float[numElements]); auto h_C = std::unique_ptr(new float[numElements]); @@ -43,12 +41,12 @@ int main() std::generate(h_B.get(), h_B.get() + numElements, generator); auto device = cuda::device::current::get(); - auto d_A = 
cuda::memory::make_unique(device, numElements); - auto d_B = cuda::memory::make_unique(device, numElements); - auto d_C = cuda::memory::make_unique(device, numElements); + auto d_A = cuda::memory::make_unique_span(device, numElements); + auto d_B = cuda::memory::make_unique_span(device, numElements); + auto d_C = cuda::memory::make_unique_span(device, numElements); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cuda::memory::copy(d_A, h_A.get()); + cuda::memory::copy(d_B, h_B.get()); auto launch_config = cuda::launch_config_builder() .overall_size(numElements) @@ -61,10 +59,10 @@ int main() cuda::launch( vectorAdd, launch_config, - d_A.get(), d_B.get(), d_C.get(), numElements + d_A.data(), d_B.data(), d_C.data(), numElements ); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cuda::memory::copy(h_C.get(), d_C); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { diff --git a/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp b/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp index 384c1dd5..87ca20f9 100644 --- a/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp +++ b/examples/modified_cuda_samples/vectorAddMMAP/vectorAddMMAP.cpp @@ -99,8 +99,14 @@ cuda::size_t determine_reservation_size( template