Fixes #578: Leaning more into the use of the launch config builder.
* Dropped `make_launch_config()`; you now either construct a `launch_configuration_t`, or use a launch config builder.
* All examples which were using `make_launch_config()` now use either simple construction of a `launch_configuration_t`, or a launch config builder.
eyalroz committed Jan 29, 2024
1 parent e11ecc3 commit 0e94573
Showing 13 changed files with 71 additions and 103 deletions.
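
For orientation, a minimal sketch of the before/after usage these diffs implement; the kernel name, data pointer, block size and element count below are placeholders rather than values taken from any one example:

// Before this commit, via the now-removed named-constructor idiom:
//     auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);

// After this commit - either construct a launch_configuration_t directly...
auto launch_config = cuda::launch_configuration_t{num_blocks, threads_per_block};

// ...or obtain one from the launch config builder, e.g. to cover num_elements
// elements with 256-thread blocks:
auto built_config = cuda::launch_config_builder()
    .overall_size(num_elements)
    .block_size(256)
    .build();

cuda::launch(my_kernel, built_config, d_data.get(), num_elements);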
9 changes: 3 additions & 6 deletions examples/by_api_module/execution_control.cu
@@ -73,9 +73,6 @@ int main(int argc, char **argv)
kernel.set_shared_memory_bank_size(
cuda::multiprocessor_shared_memory_bank_size_option_t::four_bytes_per_bank);

// You may be wondering why we're only setting these "attributes' but not
// obtaining their existing values. Well - we can't! The runtime doesn't expose
// API calls for that (as of CUDA v8.0).

// ------------------
// Kernel launching
@@ -85,14 +82,14 @@ int main(int argc, char **argv)
const unsigned num_blocks = 3;
std::cout << "Getting kernel attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" << std::endl;
auto max_threads_per_block = kernel.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
auto launch_config = cuda::make_launch_config(num_blocks, max_threads_per_block);
auto launch_config = cuda::launch_configuration_t(num_blocks, max_threads_per_block);
std::cout
<< "Launching kernel " << kernel_name
<< " with " << num_blocks << " blocks, using cuda::launch()" << std::endl;
{
// Copy and move construction and assignment of launch configurations
auto launch_config_2 = cuda::make_launch_config(2, 2, 2);
auto launch_config_3 = cuda::make_launch_config(3, 3, 3);
auto launch_config_2 = cuda::launch_configuration_t{2, 2};
auto launch_config_3 = cuda::launch_configuration_t{3, 3};
cuda::launch_configuration_t launch_config_4{launch_config};
(void) launch_config_4;
launch_config_4 = launch_config_2;
2 changes: 1 addition & 1 deletion examples/by_api_module/stream_management.cu
@@ -173,7 +173,7 @@ int main(int argc, char **argv)
stream_1.enqueue.host_invokable(callback);
auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
auto num_blocks = div_rounding_up(buffer_size, threads_per_block);
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
auto launch_config = cuda::launch_configuration_t{num_blocks, threads_per_block};
// TODO: The following doesn't have much of a meaningful effect; we should modify this example
// so that the attachment has some observable effect
stream_1.enqueue.attach_managed_region(buffer.get());
7 changes: 4 additions & 3 deletions examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
@@ -50,9 +50,10 @@ int main(int, char **)

sequence_cpu(h_ptr.get(), N);

auto block_size = 256;
auto grid_size = div_rounding_up(N, block_size);
auto launch_config = cuda::make_launch_config(grid_size, block_size);
auto launch_config = cuda::launch_config_builder()
.overall_size(N)
.block_size(256)
.build();
device.launch(sequence_gpu, launch_config, d_ptr.get(), N);

cuda::outstanding_error::ensure_none();
16 changes: 5 additions & 11 deletions examples/modified_cuda_samples/memMapIPCDrv/child.cpp
@@ -146,16 +146,6 @@ bool results_are_valid(
return true;
}

cuda::launch_configuration_t make_launch_config(const cuda::device_t &device, const cuda::kernel_t &kernel)
{
const int num_threads_per_block = 128;
auto max_active_blocks_per_sm = kernel.max_active_blocks_per_multiprocessor(num_threads_per_block, 0);
auto num_blocks = max_active_blocks_per_sm * device.multiprocessor_count();
auto launch_config = cuda::make_launch_config(num_blocks, num_threads_per_block);
return launch_config;
}


void childProcess(int devId, int id_of_this_child, char **)
{
auto device{cuda::device::get(devId)};
@@ -179,7 +169,11 @@ void childProcess(int devId, int id_of_this_child, char **)
auto module = cuda::module::create(device, fatbin);

auto kernel = module.get_kernel(kernel::name);
auto launch_config = make_launch_config(device, kernel);
auto launch_config = cuda::launch_config_builder()
.kernel(&kernel)
.block_size(128)
.saturate_with_active_blocks().build();


for (int sibling_process_offset = 0; sibling_process_offset < num_processes; sibling_process_offset++) {
// Interact with (cyclically) consecutive child processes after
@@ -142,7 +142,7 @@ void enqueue_p2p_copy(
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
// Note: We could have alternatively used:
// auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel);
auto launch_config = cuda::make_launch_config(grid_and_block_dims);
auto launch_config = cuda::launch_configuration_t{grid_and_block_dims};

for (int r = 0; r < repeat; r++) {
stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest, (int4*)src, num_elems/sizeof(int4));
@@ -444,11 +444,11 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer

// Block the stream until all the work is queued up
// DANGER! - cudaMemcpy*Async may infinitely block waiting for
// room to push the operation, so keep the number of repeatitions
// relatively low. Higher repeatitions will cause the delay kernel
// room to push the operation, so keep the number of repetitions
// relatively low. Higher repetitions will cause the delay kernel
// to timeout and lead to unstable results.
*flag = 0;
auto single_thread = cuda::make_launch_config(cuda::grid::dimensions_t::point(), cuda::grid::block_dimensions_t::point());
auto single_thread = cuda::launch_configuration_t(cuda::grid::dimensions_t::point(), cuda::grid::block_dimensions_t::point());
streams[i].enqueue.kernel_launch(delay, single_thread, flag, default_timeout_clocks);
streams[i].enqueue.event(start[i]);

@@ -155,9 +155,10 @@ int main(int argc, char** argv)
cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream);
cuda::memory::async::copy(d_B.get(), h_B.get(), size, stream);

auto threadsPerBlock = 256;
auto blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
auto launch_config = cuda::make_launch_config( blocksPerGrid, threadsPerBlock );
auto launch_config = cuda::launch_config_builder()
.overall_size(N)
.block_size(256)
.build();

cuda::outstanding_error::ensure_none();

24 changes: 12 additions & 12 deletions examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
@@ -108,8 +108,6 @@ void run_simple_streams_example(
int nstreams = 4; // number of streams for CUDA calls
int nreps = 10; // number of times each experiment is repeated; originally 10
std::size_t nbytes = params.n * sizeof(int); // number of data bytes
dim3 threads, blocks; // kernel launch configuration

int c = 5; // value to which the array will be initialized

// Allocate Host memory
@@ -151,10 +149,10 @@
std::cout << "memcopy:\t" << time_memcpy.count() << "\n";

// time kernel
threads=dim3(512, 1);
assert_(params.n % threads.x == 0);
blocks=dim3(params.n / threads.x, 1);
auto launch_config = cuda::make_launch_config(blocks, threads);
auto launch_config = cuda::launch_config_builder()
.overall_size(params.n)
.block_size(512)
.build();
start_event.record();
streams[0].enqueue.kernel_launch(init_array, launch_config, d_a.get(), d_c.get(), params.num_iterations);
stop_event.record();
@@ -164,9 +162,10 @@

//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads=dim3(512, 1);
blocks=dim3(params.n / threads.x, 1);
launch_config = cuda::make_launch_config(blocks, threads);
launch_config = cuda::launch_config_builder()
.overall_size(params.n)
.block_size(512)
.build();
start_event.record();

for (int k = 0; k < nreps; k++)
@@ -182,9 +181,10 @@

//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads=dim3(512,1);
blocks=dim3(params.n/(nstreams*threads.x),1);
launch_config = cuda::make_launch_config(blocks, threads);
launch_config = cuda::launch_config_builder()
.overall_size(params.n/nstreams)
.block_size(512)
.build();
// TODO: Avoid need to push and pop here
memset(h_a.get(), 255, nbytes); // set host memory bits to all 1s, for testing correctness
// This instruction is actually the only one in our program
@@ -44,15 +44,17 @@ int main()
std::generate(buffer_B.get(), buffer_B.get() + numElements, generator);

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
auto launch_config = cuda::launch_config_builder()
.overall_size(numElements)
.block_size(256)
.build();

std::cout
<< "CUDA kernel launch with " << blocksPerGrid
<< " blocks of " << threadsPerBlock << " threads\n";
<< "CUDA kernel launch with " << launch_config.dimensions.grid.volume()
<< " blocks of " << launch_config.dimensions.block.volume() << " threads\n";

cuda::launch(
vectorAdd,
cuda::make_launch_config( blocksPerGrid, threadsPerBlock ),
vectorAdd, launch_config,
buffer_A.get(), buffer_B.get(), buffer_C.get(), numElements
);

18 changes: 10 additions & 8 deletions examples/modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu
@@ -50,17 +50,19 @@ int main()
std::generate(h_B, h_B + numElements, generator);

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

auto launch_config = cuda::launch_config_builder()
.overall_size(numElements)
.block_size(256)
.build();

std::cout
<< "CUDA kernel launch with " << blocksPerGrid
<< " blocks of " << threadsPerBlock << " threads\n";
<< "CUDA kernel launch with " << launch_config.dimensions.grid.volume()
<< " blocks of " << launch_config.dimensions.block.volume() << " threads\n";

cuda::launch(
vectorAdd,
cuda::make_launch_config( blocksPerGrid, threadsPerBlock ),
d_A, d_B, d_C, numElements
);
vectorAdd, launch_config,
d_A, d_B, d_C, numElements);

// Synchronization is necessary here despite the synchronous nature of the default stream -
// since the copying-back of data is not something we've waited for
18 changes: 8 additions & 10 deletions examples/other/array_management.cu
@@ -79,9 +79,11 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) {
grid::dimension_t( div_rounding_up(h, block_dim) ),
grid::dimension_t( div_rounding_up(d, block_dim) )
};

auto launch_config = cuda::launch_configuration_t{grid_dims, block_dims};
cuda::launch(
kernels::from_3D_texture_to_memory_space,
cuda::make_launch_config(grid_dims, block_dims),
launch_config,
tv.raw_handle(), ptr_out.get(), w, h, d);
device.synchronize();
check_output_is_iota("copy from 3D texture into (managed) global memory", ptr_out.get(), arr.size());
@@ -115,8 +117,6 @@ void print_2d_array(const char* title, const T* a, size_t width, size_t height)

void array_2d_example(cuda::device_t& device, size_t w, size_t h)
{
namespace grid = cuda::grid;

const cuda::array::dimensions_t<2> dims = {w, h};
auto arr = cuda::array::create<float>(device , dims);
auto ptr_in = cuda::memory::managed::make_unique<float[]>(arr.size());
@@ -130,14 +130,12 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)
cuda::texture_view tv(arr);

constexpr cuda::grid::block_dimension_t block_dim = 10;
constexpr auto block_dims = cuda::grid::block_dimensions_t::square(block_dim);
assert(div_rounding_up(w, block_dim) <= std::numeric_limits<grid::dimension_t>::max());
assert(div_rounding_up(h, block_dim) <= std::numeric_limits<grid::dimension_t>::max());
const cuda::grid::dimensions_t grid_dims = {
grid::dimension_t( div_rounding_up(w, block_dim) ),
grid::dimension_t( div_rounding_up(h, block_dim) ),
1
};
auto launch_config = cuda::launch_config_builder()
.overall_dimensions(w, h)
.block_dimensions(block_dim, block_dim)
.build();

auto ptr_out = cuda::memory::managed::make_unique<float[]>(arr.size());
// The following is to make it easier to notice if nothing get copied
Expand All @@ -147,7 +145,7 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)

cuda::launch(
kernels::from_2D_texture_to_memory_space,
cuda::make_launch_config(grid_dims, block_dims),
launch_config,
tv.raw_handle(), ptr_out.get(), w, h);
cuda::memory::copy(ptr_out.get(), arr);
device.synchronize();
25 changes: 8 additions & 17 deletions examples/other/io_compute_overlap_with_streams.cu
@@ -45,21 +45,6 @@ constexpr I div_rounding_up(I dividend, const I2 divisor) noexcept
return (dividend / divisor) + !!(dividend % divisor);
}

/*
* Produce a launch configuration with one thread covering each element
*/
cuda::launch_configuration_t make_linear_launch_config(
const cuda::device_t device,
size_t length)
{
auto threads_per_block = device.properties().max_threads_per_block();
auto num_blocks = div_rounding_up(length, threads_per_block);
if (num_blocks > std::numeric_limits<cuda::grid::dimension_t>::max()) {
throw std::invalid_argument("Specified length exceeds CUDA's support for a linear grid");
}
return cuda::make_launch_config((cuda::grid::dimensions_t) num_blocks, threads_per_block, cuda::no_dynamic_shared_memory);
}

struct buffer_set_t {
cuda::memory::host::unique_ptr<element_t[]> host_lhs;
cuda::memory::host::unique_ptr<element_t[]> host_rhs;
@@ -115,8 +100,14 @@ int main(int, char **)
std::generate_n(std::back_inserter(streams), num_kernels,
[&]() { return device.create_stream(cuda::stream::async); });

auto common_launch_config = make_linear_launch_config(device, num_elements);
auto buffer_size = num_elements * sizeof(element_t);
auto common_launch_config = cuda::launch_config_builder()
.device(device)
.overall_size(num_elements)
.use_maximum_linear_block()
.build();


auto buffer_size = num_elements * sizeof(element_t);

std::cout
<< "Running " << num_kernels << " sequences of HtoD-kernel-DtoH, in parallel" << std::endl;
8 changes: 4 additions & 4 deletions examples/other/jitify/jitify.cpp
@@ -160,7 +160,7 @@ void my_kernel(T* data) {
T h_data = 5;
cuda::memory::copy_single<T>(d_data.get(), &h_data);

auto single_thread_launch_config = cuda::make_launch_config( cuda::grid::composite_dimensions_t::point());
auto single_thread_launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
device.launch(kernel, single_thread_launch_config, d_data.get());
cuda::memory::copy_single<T>(&h_data, d_data.get());
return are_close(h_data, 125.f);
@@ -247,7 +247,7 @@ void my_kernel2(float const* indata, float* outdata) {
T inval = 3.14159f;
cuda::memory::copy_single<T>(indata.get(), &inval);

auto launch_config = cuda::make_launch_config(cuda::grid::composite_dimensions_t::point());
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
cuda::launch(my_kernel1, launch_config, indata.get(), outdata.get());
cuda::launch(my_kernel2, launch_config, indata.get(), outdata.get());

@@ -309,7 +309,7 @@ __global__ void constant_test(int *x) {
cuda::memory::copy(b_a, &inval[1]);
cuda::memory::copy(c_b_a, &inval[2]);
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
auto launch_config = cuda::make_launch_config(cuda::grid::composite_dimensions_t::point());
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
cuda::memory::copy(outval, outdata.get(), sizeof(outval));
@@ -341,7 +341,7 @@ bool test_constant_2()
auto kernel = module.get_kernel(compilation_result.get_mangling_of(second_kernel_name));
int inval[] = {3, 5, 9};
cuda::memory::copy(anon_b_a, inval);
auto launch_config = cuda::make_launch_config(cuda::grid::composite_dimensions_t::point());
auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
cuda::launch(kernel, launch_config, outdata.get());
int outval[n_const];
18 changes: 0 additions & 18 deletions src/cuda/api/launch_configuration.hpp
@@ -143,24 +143,6 @@ struct launch_configuration_t {
launch_configuration_t& operator=(launch_configuration_t&&) = default;
};

/**
* @brief a named constructor idiom for a @ref launch_config_t
*/
constexpr launch_configuration_t make_launch_config(
grid::composite_dimensions_t grid_and_block_dimensions,
memory::shared::size_t dynamic_shared_memory_size = 0u) noexcept
{
return { grid_and_block_dimensions, dynamic_shared_memory_size };
}

constexpr launch_configuration_t make_launch_config(
grid::dimensions_t grid_dimensions,
grid::block_dimensions_t block_dimensions,
memory::shared::size_t dynamic_shared_memory_size = 0u) noexcept
{
return { { grid_dimensions, block_dimensions }, dynamic_shared_memory_size };
}

constexpr bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
{
return
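
Note: callers which passed a dynamic shared memory size to the removed overloads can presumably construct the configuration the same way the removed function bodies did; a sketch, in which grid_dims, block_dims, grid_and_block_dims and dynamic_shared_memory_size are placeholders:

// Roughly what make_launch_config(grid_dims, block_dims, dynamic_shared_memory_size) returned:
auto config = cuda::launch_configuration_t{ { grid_dims, block_dims }, dynamic_shared_memory_size };

// ...and likewise for the composite-dimensions overload:
auto config_2 = cuda::launch_configuration_t{ grid_and_block_dims, dynamic_shared_memory_size };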
