Fixes #578: Leaning more into the use of the launch config builder.
* Dropped `make_launch_config()`; you now either construct a `launch_configuration_t`, or use a launch config builder.
* All examples which were using `make_launch_config()` now use either simple construction of a `launch_configuration_t`, or a launch config builder.
eyalroz committed Jan 29, 2024
1 parent e11ecc3 commit 0e94573
Showing 13 changed files with 71 additions and 103 deletions.
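
For orientation, a minimal sketch of the before/after usage these diffs implement; the kernel name, data pointer, block size and element count below are placeholders rather than values taken from any one example:

// Before this commit, via the now-removed named-constructor idiom:
//     auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);

// After this commit - either construct a launch_configuration_t directly...
auto launch_config = cuda::launch_configuration_t{num_blocks, threads_per_block};

// ...or obtain one from the launch config builder, e.g. to cover num_elements
// elements with 256-thread blocks:
auto built_config = cuda::launch_config_builder()
    .overall_size(num_elements)
    .block_size(256)
    .build();

cuda::launch(my_kernel, built_config, d_data.get(), num_elements);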
9 changes: 3 additions & 6 deletions examples/by_api_module/execution_control.cu
@@ -73,9 +73,6 @@ int main(int argc, char **argv)
kernel.set_shared_memory_bank_size(
cuda::multiprocessor_shared_memory_bank_size_option_t::four_bytes_per_bank);

// You may be wondering why we're only setting these "attributes' but not
// obtaining their existing values. Well - we can't! The runtime doesn't expose
// API calls for that (as of CUDA v8.0).

// ------------------
// Kernel launching
@@ -85,14 +82,14 @@ int main(int argc, char **argv)
const unsigned num_blocks = 3;
std::cout << "Getting kernel attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" << std::endl;
auto max_threads_per_block = kernel.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
auto launch_config = cuda::make_launch_config(num_blocks, max_threads_per_block);
auto launch_config = cuda::launch_configuration_t(num_blocks, max_threads_per_block);
std::cout
<< "Launching kernel " << kernel_name
<< " with " << num_blocks << " blocks, using cuda::launch()" << std::endl;
{
// Copy and move construction and assignment of launch configurations
auto launch_config_2 = cuda::make_launch_config(2, 2, 2);
auto launch_config_3 = cuda::make_launch_config(3, 3, 3);
auto launch_config_2 = cuda::launch_configuration_t{2, 2};
auto launch_config_3 = cuda::launch_configuration_t{3, 3};
cuda::launch_configuration_t launch_config_4{launch_config};
(void) launch_config_4;
launch_config_4 = launch_config_2;
2 changes: 1 addition & 1 deletion examples/by_api_module/stream_management.cu
@@ -173,7 +173,7 @@ int main(int argc, char **argv)
stream_1.enqueue.host_invokable(callback);
auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
auto num_blocks = div_rounding_up(buffer_size, threads_per_block);
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
auto launch_config = cuda::launch_configuration_t{num_blocks, threads_per_block};
// TODO: The following doesn't have much of a meaningful effect; we should modify this example
// so that the attachment has some observable effect
stream_1.enqueue.attach_managed_region(buffer.get());
7 changes: 4 additions & 3 deletions examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
@@ -50,9 +50,10 @@ int main(int, char **)

sequence_cpu(h_ptr.get(), N);

auto block_size = 256;
auto grid_size = div_rounding_up(N, block_size);
auto launch_config = cuda::make_launch_config(grid_size, block_size);
auto launch_config = cuda::launch_config_builder()
.overall_size(N)
.block_size(256)
.build();
device.launch(sequence_gpu, launch_config, d_ptr.get(), N);

cuda::outstanding_error::ensure_none();
16 changes: 5 additions & 11 deletions examples/modified_cuda_samples/memMapIPCDrv/child.cpp
@@ -146,16 +146,6 @@ bool results_are_valid(
return true;
}

cuda::launch_configuration_t make_launch_config(const cuda::device_t &device, const cuda::kernel_t &kernel)
{
const int num_threads_per_block = 128;
auto max_active_blocks_per_sm = kernel.max_active_blocks_per_multiprocessor(num_threads_per_block, 0);
auto num_blocks = max_active_blocks_per_sm * device.multiprocessor_count();
auto launch_config = cuda::make_launch_config(num_blocks, num_threads_per_block);
return launch_config;
}


void childProcess(int devId, int id_of_this_child, char **)
{
auto device{cuda::device::get(devId)};
@@ -179,7 +169,11 @@ void childProcess(int devId, int id_of_this_child, char **)
auto module = cuda::module::create(device, fatbin);

auto kernel = module.get_kernel(kernel::name);
auto launch_config = make_launch_config(device, kernel);
auto launch_config = cuda::launch_config_builder()
.kernel(&kernel)
.block_size(128)
.saturate_with_active_blocks().build();


for (int sibling_process_offset = 0; sibling_process_offset < num_processes; sibling_process_offset++) {
// Interact with (cyclically) consecutive child processes after
@@ -142,7 +142,7 @@ void enqueue_p2p_copy(
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
// Note: We could have alternatively used:
// auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel);
auto launch_config = cuda::make_launch_config(grid_and_block_dims);
auto launch_config = cuda::launch_configuration_t{grid_and_block_dims};

for (int r = 0; r < repeat; r++) {
stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest, (int4*)src, num_elems/sizeof(int4));
@@ -444,11 +444,11 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer

// Block the stream until all the work is queued up
// DANGER! - cudaMemcpy*Async may infinitely block waiting for
// room to push the operation, so keep the number of repeatitions
// relatively low. Higher repeatitions will cause the delay kernel
// room to push the operation, so keep the number of repetitions
// relatively low. Higher repetitions will cause the delay kernel
// to timeout and lead to unstable results.
*flag = 0;
auto single_thread = cuda::make_launch_config(cuda::grid::dimensions_t::point(), cuda::grid::block_dimensions_t::point());
auto single_thread = cuda::launch_configuration_t(cuda::grid::dimensions_t::point(), cuda::grid::block_dimensions_t::point());
streams[i].enqueue.kernel_launch(delay, single_thread, flag, default_timeout_clocks);
streams[i].enqueue.event(start[i]);

@@ -155,9 +155,10 @@ int main(int argc, char** argv)
cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream);
cuda::memory::async::copy(d_B.get(), h_B.get(), size, stream);

auto threadsPerBlock = 256;
auto blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
auto launch_config = cuda::make_launch_config( blocksPerGrid, threadsPerBlock );
auto launch_config = cuda::launch_config_builder()
.overall_size(N)
.block_size(256)
.build();

cuda::outstanding_error::ensure_none();

24 changes: 12 additions & 12 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
@@ -108,8 +108,6 @@ void run_simple_streams_example(
int nstreams = 4; // number of streams for CUDA calls
int nreps = 10; // number of times each experiment is repeated; originally 10
std::size_t nbytes = params.n * sizeof(int); // number of data bytes
dim3 threads, blocks; // kernel launch configuration

int c = 5; // value to which the array will be initialized

// Allocate Host memory
@@ -151,10 +149,10 @@
std::cout << "memcopy:\t" << time_memcpy.count() << "\n";

// time kernel
threads=dim3(512, 1);
assert_(params.n % threads.x == 0);
blocks=dim3(params.n / threads.x, 1);
auto launch_config = cuda::make_launch_config(blocks, threads);
auto launch_config = cuda::launch_config_builder()
.overall_size(params.n)
.block_size(512)
.build();
start_event.record();
streams[0].enqueue.kernel_launch(init_array, launch_config, d_a.get(), d_c.get(), params.num_iterations);
stop_event.record();
@@ -164,9 +162,10 @@

//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads=dim3(512, 1);
blocks=dim3(params.n / threads.x, 1);
launch_config = cuda::make_launch_config(blocks, threads);
launch_config = cuda::launch_config_builder()
.overall_size(params.n)
.block_size(512)
.build();
start_event.record();

for (int k = 0; k < nreps; k++)
@@ -182,9 +181,10 @@

//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads=dim3(512,1);
blocks=dim3(params.n/(nstreams*threads.x),1);
launch_config = cuda::make_launch_config(blocks, threads);
launch_config = cuda::launch_config_builder()
.overall_size(params.n/nstreams)
.block_size(512)
.build();
// TODO: Avoid need to push and pop here
memset(h_a.get(), 255, nbytes); // set host memory bits to all 1s, for testing correctness
// This instruction is actually the only one in our program
@@ -44,15 +44,17 @@ int main()
std::generate(buffer_B.get(), buffer_B.get() + numElements, generator);

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
auto launch_config = cuda::launch_config_builder()
.overall_size(numElements)
.block_size(256)
.build();

std::cout
<< "CUDA kernel launch with " << blocksPerGrid
<< " blocks of " << threadsPerBlock << " threads\n";
<< "CUDA kernel launch with " << launch_config.dimensions.grid.volume()
<< " blocks of " << launch_config.dimensions.block.volume() << " threads\n";

cuda::launch(
vectorAdd,
cuda::make_launch_config( blocksPerGrid, threadsPerBlock ),
vectorAdd, launch_config,
buffer_A.get(), buffer_B.get(), buffer_C.get(), numElements
);

18 changes: 10 additions & 8 deletions examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu
@@ -50,17 +50,19 @@ int main()
std::generate(h_B, h_B + numElements, generator);

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

auto launch_config = cuda::launch_config_builder()
.overall_size(numElements)
.block_size(256)
.build();

std::cout
<< "CUDA kernel launch with " << blocksPerGrid
<< " blocks of " << threadsPerBlock << " threads\n";
<< "CUDA kernel launch with " << launch_config.dimensions.grid.volume()
<< " blocks of " << launch_config.dimensions.block.volume() << " threads\n";

cuda::launch(
vectorAdd,
cuda::make_launch_config( blocksPerGrid, threadsPerBlock ),
d_A, d_B, d_C, numElements
);
vectorAdd, launch_config,
d_A, d_B, d_C, numElements);

// Synchronization is necessary here despite the synchronous nature of the default stream -
// since the copying-back of data is not something we've waited for
18 changes: 8 additions & 10 deletions examples/other/array_management.cu
@@ -79,9 +79,11 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) {
grid::dimension_t( div_rounding_up(h, block_dim) ),
grid::dimension_t( div_rounding_up(d, block_dim) )
};

auto launch_config = cuda::launch_configuration_t{grid_dims, block_dims};
cuda::launch(
kernels::from_3D_texture_to_memory_space,
cuda::make_launch_config(grid_dims, block_dims),
launch_config,
tv.raw_handle(), ptr_out.get(), w, h, d);
device.synchronize();
check_output_is_iota("copy from 3D texture into (managed) global memory", ptr_out.get(), arr.size());
@@ -115,8 +117,6 @@ void print_2d_array(const char* title, const T* a, size_t width, size_t height)

void array_2d_example(cuda::device_t& device, size_t w, size_t h)
{
namespace grid = cuda::grid;

const cuda::array::dimensions_t<2> dims = {w, h};
auto arr = cuda::array::create<float>(device , dims);
auto ptr_in = cuda::memory::managed::make_unique<float[]>(arr.size());
@@ -130,14 +130,12 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)
cuda::texture_view tv(arr);

constexpr cuda::grid::block_dimension_t block_dim = 10;
constexpr auto block_dims = cuda::grid::block_dimensions_t::square(block_dim);
assert(div_rounding_up(w, block_dim) <= std::numeric_limits<grid::dimension_t>::max());
assert(div_rounding_up(h, block_dim) <= std::numeric_limits<grid::dimension_t>::max());
const cuda::grid::dimensions_t grid_dims = {
grid::dimension_t( div_rounding_up(w, block_dim) ),
grid::dimension_t( div_rounding_up(h, block_dim) ),
1
};
auto launch_config = cuda::launch_config_builder()
.overall_dimensions(w, h)
.block_dimensions(block_dim, block_dim)
.build();

auto ptr_out = cuda::memory::managed::make_unique<float[]>(arr.size());
// The following is to make it easier to notice if nothing get copied
Expand All @@ -147,7 +145,7 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)

cuda::launch(
kernels::from_2D_texture_to_memory_space,
cuda::make_launch_config(grid_dims, block_dims),
launch_config,
tv.raw_handle(), ptr_out.get(), w, h);
cuda::memory::copy(ptr_out.get(), arr);
device.synchronize();
25 changes: 8 additions & 17 deletions examples/other/io_compute_overlap_with_streams.cu
@@ -45,21 +45,6 @@ constexpr I div_rounding_up(I dividend, const I2 divisor) noexcept
return (dividend / divisor) + !!(dividend % divisor);
}

/*
* Produce a launch configuration with one thread covering each element
*/
cuda::launch_configuration_t make_linear_launch_config(
const cuda::device_t device,
size_t length)
{
auto threads_per_block = device.properties().max_threads_per_block();
auto num_blocks = div_rounding_up(length, threads_per_block);
if (num_blocks > std::numeric_limits<cuda::grid::dimension_t>::max()) {
throw std::invalid_argument("Specified length exceeds CUDA's support for a linear grid");
}
return cuda::make_launch_config((cuda::grid::dimensions_t) num_blocks, threads_per_block, cuda::no_dynamic_shared_memory);
}

struct buffer_set_t {
cuda::memory::host::unique_ptr<element_t[]> host_lhs;
cuda::memory::host::unique_ptr<element_t[]> host_rhs;
@@ -115,8 +100,14 @@ int main(int, char **)
std::generate_n(std::back_inserter(streams), num_kernels,
[&]() { return device.create_stream(cuda::stream::async); });

auto common_launch_config = make_linear_launch_config(device, num_elements);
auto buffer_size = num_elements * sizeof(element_t);
auto common_launch_config = cuda::launch_config_builder()
.device(device)
.overall_size(num_elements)
.use_maximum_linear_block()
.build();


auto buffer_size = num_elements * sizeof(element_t);

std::cout
<< "Running " << num_kernels << " sequences of HtoD-kernel-DtoH, in parallel" << std::endl;
8 changes: 4 additions & 4 deletions examples/other/jitify/jitify.cpp
@@ -160,7 +160,7 @@ void my_kernel(T* data) {
T h_data = 5;
cuda::memory::copy_single<T>(d_data.get(), &h_data);

auto single_thread_launch_config = cuda::make_launch_config( cuda::grid::composite_dimensions_t::point());
auto single_thread_launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
device.launch(kernel, single_thread_launch_config, d_data.get());
cuda::memory::copy_single<T>(&h_data, d_data.get());
return are_close(h_data, 125.f);
@@ -247,7 +247,7 @@ void my_kernel2(float const* indata, float* outdata) {
T inval = 3.14159f;
cuda::memory::copy_single<T>(indata.get(), &inval);

auto launch_config = cuda::make_launch_config(cuda::grid::composite_dimensions_t::point());
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
cuda::launch(my_kernel1, launch_config, indata.get(), outdata.get());
cuda::launch(my_kernel2, launch_config, indata.get(), outdata.get());

@@ -309,7 +309,7 @@ __global__ void constant_test(int *x) {
cuda::memory::copy(b_a, &inval[1]);
cuda::memory::copy(c_b_a, &inval[2]);
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
auto launch_config = cuda::make_launch_config(cuda::grid::composite_dimensions_t::point());
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
cuda::memory::copy(outval, outdata.get(), sizeof(outval));
@@ -341,7 +341,7 @@ bool test_constant_2()
auto kernel = module.get_kernel(compilation_result.get_mangling_of(second_kernel_name));
int inval[] = {3, 5, 9};
cuda::memory::copy(anon_b_a, inval);
auto launch_config = cuda::make_launch_config(cuda::grid::composite_dimensions_t::point());
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
18 changes: 0 additions & 18 deletions src/cuda/api/launch_configuration.hpp
@@ -143,24 +143,6 @@ struct launch_configuration_t {
launch_configuration_t& operator=(launch_configuration_t&&) = default;
};

/**
* @brief a named constructor idiom for a @ref launch_config_t
*/
constexpr launch_configuration_t make_launch_config(
grid::composite_dimensions_t grid_and_block_dimensions,
memory::shared::size_t dynamic_shared_memory_size = 0u) noexcept
{
return { grid_and_block_dimensions, dynamic_shared_memory_size };
}

constexpr launch_configuration_t make_launch_config(
grid::dimensions_t grid_dimensions,
grid::block_dimensions_t block_dimensions,
memory::shared::size_t dynamic_shared_memory_size = 0u) noexcept
{
return { { grid_dimensions, block_dimensions }, dynamic_shared_memory_size };
}

constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
{
return
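
Note: callers which passed a dynamic shared memory size to the removed overloads can presumably construct the configuration the same way the removed function bodies did; a sketch, in which grid_dims, block_dims, grid_and_block_dims and dynamic_shared_memory_size are placeholders:

// Roughly what make_launch_config(grid_dims, block_dims, dynamic_shared_memory_size) returned:
auto config = cuda::launch_configuration_t{ { grid_dims, block_dims }, dynamic_shared_memory_size };

// ...and likewise for the composite-dimensions overload:
auto config_2 = cuda::launch_configuration_t{ grid_and_block_dims, dynamic_shared_memory_size };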
