From 78f30f61d7305852e9f6879b89f8cfab83680f12 Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg
Date: Mon, 5 Feb 2024 20:53:55 +0200
Subject: [PATCH] Fixes #582, fixes #564:

* launch config <-> device validation now checks for block cooperation support when that's requested
* Refactored and re-located some of the launch config validation code
* Added: `device_t` method for checking block clustering support
* Now properly validating grid dimensions to ensure we don't exceed the maxima
* Made sure the code paths inwards from the non-detail_ launching functions to the actual CUDA API calls all have appropriate validation calls
* In the error_handling example - now making the faulty launch configuration device-specific, otherwise we don't apply the valid-block-size limit
* Comment and spacing tweaks

---
 examples/by_api_module/error_handling.cu      |   1 +
 src/cuda/api/device.hpp                       |  11 ++
 src/cuda/api/kernel_launch.hpp                |  24 +--
 src/cuda/api/launch_config_builder.hpp        |  46 ++++-
 src/cuda/api/launch_configuration.hpp         |  12 +-
 .../api/multi_wrapper_impls/kernel_launch.hpp | 182 ++++++++++++++----
 .../launch_configuration.hpp                  |  30 ++-
 src/cuda/api/types.hpp                        |   8 +
 8 files changed, 229 insertions(+), 85 deletions(-)

diff --git a/examples/by_api_module/error_handling.cu b/examples/by_api_module/error_handling.cu
index f52c2d8e..73a1ad20 100644
--- a/examples/by_api_module/error_handling.cu
+++ b/examples/by_api_module/error_handling.cu
@@ -38,6 +38,7 @@ int main(int, char **)
 	bool got_expected_exception = false;
 	try {
 		cuda::launch_configuration_t lc = cuda::launch_config_builder()
+			.device(device)
 			.overall_size(2048)
 			.block_dimensions(15000) // Note: higher than the possible maximum for known CUDA devices
 			.build();
diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp
index e7469d66..36967143 100644
--- a/src/cuda/api/device.hpp
+++ b/src/cuda/api/device.hpp
@@ -357,6 +357,17 @@ class device_t {
 		return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
 	}
 
+#if CUDA_VERSION >= 12000
+	/**
+	 * True if this device supports "clusters" of grid blocks,
+	 * which can pool their shared memory together
+	 */
+	bool supports_block_clustering() const
+	{
+		return get_attribute(CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH);
+	}
+#endif
+
 #if CUDA_VERSION >= 11020
 	/**
 	 * True if this device supports executing kernels in which blocks can
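For callers, the new attribute getter composes with the pre-existing cooperation check. A minimal sketch of gating launch settings on device capabilities (assuming a current device has been set):

    cuda::device_t dev = cuda::device::current::get();
    bool can_cooperate = dev.supports_block_cooperation();
    bool can_cluster   = dev.supports_block_clustering(); // only available with CUDA 12.0 and later
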
diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp
index 51657d19..6fe2dfe8 100644
--- a/src/cuda/api/kernel_launch.hpp
+++ b/src/cuda/api/kernel_launch.hpp
@@ -178,9 +178,6 @@ void enqueue_raw_kernel_launch_in_current_context(
 	using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
 	static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
 		"Only a bona fide function can be launched as a CUDA kernel");
-#ifndef NDEBUG
-	validate(launch_configuration);
-#endif
 	if (not launch_configuration.has_nondefault_attributes()) {
 		// regular plain vanilla launch
 		kernel_function <<<
@@ -323,26 +320,7 @@ void enqueue_launch(
 	Kernel&&                kernel,
 	const stream_t&         stream,
 	launch_configuration_t  launch_configuration,
-	KernelParameters&&...   parameters)
-{
-	static_assert(
-		detail_::all_true<::std::is_trivially_copy_constructible<typename ::std::decay<KernelParameters>::type>::value...>::value,
-		"All kernel parameter types must be of a trivially copy-constructible (decayed) type.");
-	static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
-#if CUDA_VERSION >= 12000
-	static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
-#else
-	static constexpr const bool library_kernel = false;
-#endif // CUDA_VERSION >= 12000
-	// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
-	// use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
-	// function for each of them will basically be just a one-liner :-(
-	detail_::enqueue_launch(
-		detail_::bool_constant<wrapped_contextual_kernel>{},
-		detail_::bool_constant<library_kernel>{},
-		::std::forward<Kernel>(kernel), stream, launch_configuration,
-		::std::forward<KernelParameters>(parameters)...);
-}
+	KernelParameters&&...   parameters);
 
 /**
  * Variant of @ref enqueue_launch for use with the default stream in the current context.
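The `enqueue_launch` definition moves into multi_wrapper_impls/kernel_launch.hpp (below); call sites are unaffected by the relocation. A minimal sketch of such a call, assuming an existing `cuda::stream_t` named `stream` and a device-side buffer `buf`:

    __global__ void scale(float* data, float factor)
    {
        data[threadIdx.x] *= factor;
    }

    // launch configuration built the same way as before the refactoring:
    auto config = cuda::launch_config_builder().overall_size(2048).block_size(256).build();
    cuda::enqueue_launch(scale, stream, config, buf, 2.0f);
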
diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp
index 471541ee..a1b960ad 100644
--- a/src/cuda/api/launch_config_builder.hpp
+++ b/src/cuda/api/launch_config_builder.hpp
@@ -1,9 +1,9 @@
 /**
  * @file
  *
- * @brief Contains the @ref launch
+ * @brief Contains the @ref `cuda::launch_config_builder_t` class definition
  *
- * @note Launch configurations are used mostly in @ref kernel_launch.hpp .
+ * @note Launch configurations are used mostly in @ref `kernel_launch.hpp`.
  */
 #pragma once
 
@@ -178,6 +178,7 @@ class launch_config_builder_t {
 	struct {
 		optional<grid::block_dimensions_t  > block;
+		optional<grid::dimensions_t        > block_cluster;
 		optional<grid::dimensions_t        > grid;
 		optional<grid::overall_dimensions_t> overall;
 	} dimensions_;
@@ -188,7 +189,7 @@ class launch_config_builder_t {
 	// but the semantic is that if the determiner is not null, we use it;
 	// and if you want to force a concrete apriori value, then you nullify
 	// the determiner
-	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ {nullptr };
+	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
 	memory::shared::size_t dynamic_shared_memory_size_ { 0 };
 
 	const kernel_t* kernel_ { nullptr };
@@ -224,7 +225,7 @@ class launch_config_builder_t {
 		memory::shared::size_t shared_mem_size)
 	{
 		if (kernel_ptr == nullptr) { return; }
-		detail_::validate_compatibility(*kernel_ptr, shared_mem_size);
+		detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
 	}
 
 	static void validate_compatibility(
@@ -232,7 +233,7 @@ class launch_config_builder_t {
 		optional<device::id_t> maybe_device_id,
 		memory::shared::size_t shared_mem_size)
 	{
 		if (not maybe_device_id) { return; }
-		detail_::validate_compatibility(device(maybe_device_id), shared_mem_size);
+		detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
 	}
 
 	void validate_dynamic_shared_memory_size(memory::shared::size_t size)
@@ -269,6 +270,15 @@ class launch_config_builder_t {
 		validate_block_dimension_compatibility(device_, block_dims);
 	}
 
+
+	static void validate_grid_dimension_compatibility(
+		optional<device::id_t> maybe_device_id,
+		grid::dimensions_t grid_dims)
+	{
+		if (not maybe_device_id) { return; }
+		detail_::validate_grid_dimension_compatibility(device(maybe_device_id), grid_dims);
+	}
+
 	void validate_grid_dimensions(grid::dimensions_t grid_dims) const
 	{
 		detail_::validate_grid_dimensions(grid_dims);
@@ -279,6 +289,16 @@ class launch_config_builder_t {
 		// TODO: Check divisibility
 	}
 
+#if CUDA_VERSION >= 12000
+	void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
+	{
+		if (dimensions_.grid and not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
+			throw ::std::runtime_error("The requested block cluster dimensions do not "
+				"divide the grid dimensions (in blocks)");
+		}
+	}
+#endif // CUDA_VERSION >= 12000
+
 	void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
 	{
 		if (dimensions_.block and dimensions_.grid) {
@@ -309,7 +329,8 @@ class launch_config_builder_t {
 			get_composite_dimensions().block;
 		validate_block_dimension_compatibility(device_id, block_dims);
 	}
-	validate_compatibility(device_id, dynamic_shared_memory_size_);
+	detail_::validate_compatibility(
+		device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
 }
 
 	void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
@@ -318,7 +339,7 @@ class launch_config_builder_t {
 		validate_block_dimension_compatibility(device_, composite_dims.block);
 
 		// Is there anything to validate regarding the grid dims?
-		validate_block_dimension_compatibility(device_, composite_dims.grid);
+		validate_grid_dimension_compatibility(device_, composite_dims.grid);
 	}
 #endif // ifndef NDEBUG
 
@@ -378,6 +399,17 @@ class launch_config_builder_t {
 		return *this;
 	}
 
+#if CUDA_VERSION >= 12000
+	launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
+	{
+#ifndef NDEBUG
+		validate_cluster_dimensions(cluster_dims);
+#endif
+		dimensions_.block_cluster = cluster_dims;
+		return *this;
+	}
+#endif
+
 	launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
 	{
 #ifndef NDEBUG
diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp
index 17552bc8..455d4704 100644
--- a/src/cuda/api/launch_configuration.hpp
+++ b/src/cuda/api/launch_configuration.hpp
@@ -207,7 +207,7 @@ namespace detail_ {
 
 // Note: This will not check anything related to the device or the kernel
 // with which the launch configuration is to be used
-inline void validate(launch_configuration_t launch_config) noexcept(false)
+inline void validate(const launch_configuration_t& launch_config) noexcept(false)
 {
 	validate_block_dimensions(launch_config.dimensions.block);
 	validate_grid_dimensions(launch_config.dimensions.grid);
@@ -223,15 +223,9 @@ inline void validate_compatibility(
 	// validate_grid_dimension_compatibility(device, launch_config.dimensions.grid);
 }
 
-inline void validate_compatibility(
+void validate_compatibility(
 	const kernel_t& kernel,
-	launch_configuration_t launch_config) noexcept(false)
-{
-	validate(launch_config);
-	validate_block_dimension_compatibility(kernel, launch_config.dimensions.block);
-	// Uncomment if we actually get such checks
-	// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
-}
+	launch_configuration_t launch_config) noexcept(false);
 
 using launch_attribute_index_t = unsigned int;
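With the new `cluster_blocks` setter, cluster dimensions are checked (in debug builds) for divisibility against already-set grid dimensions. A hypothetical CUDA-12 sketch, where `dev` is an assumed `cuda::device_t` and the dimension values are placeholders:

    auto config = cuda::launch_config_builder()
        .device(dev)
        .grid_dimensions(32)    // 32 blocks
        .block_dimensions(256)  // 256 threads per block
        .cluster_blocks(8)      // 8-block clusters; 8 divides 32, so validation passes
        .build();

Had the cluster size not divided the grid size on some axis - say, `.cluster_blocks(5)` - the setter would throw `::std::runtime_error` in debug builds.
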
diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
index cb00eb8f..0d34805f 100644
--- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
+++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
@@ -21,9 +21,41 @@
 
 namespace cuda {
 
+template <typename Kernel, typename... KernelParameters>
+void enqueue_launch(
+	Kernel&&                kernel,
+	const stream_t&         stream,
+	launch_configuration_t  launch_configuration,
+	KernelParameters&&...   parameters)
+{
+	static_assert(
+		detail_::all_true<::std::is_trivially_copy_constructible<typename ::std::decay<KernelParameters>::type>::value...>::value,
+		"All kernel parameter types must be of a trivially copy-constructible (decayed) type.");
+	static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
+#if CUDA_VERSION >= 12000
+	static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
+#else
+	static constexpr const bool library_kernel = false;
+#endif // CUDA_VERSION >= 12000
+#ifndef NDEBUG
+	// The compatibility of wrapped kernels and library kernels with the launch configuration
+	// will be validated further inside, once we have differentiated them from raw kernels
+	detail_::validate(launch_configuration);
+#endif
+
+	// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
+	// use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
+	// function for each of them will basically be just a one-liner :-(
+	detail_::enqueue_launch(
+		detail_::bool_constant<wrapped_contextual_kernel>{},
+		detail_::bool_constant<library_kernel>{},
+		::std::forward<Kernel>(kernel), stream, launch_configuration,
+		::std::forward<KernelParameters>(parameters)...);
+}
+
 namespace detail_ {
 
-inline void validate_compatibility(
+inline void validate_shared_mem_compatibility(
 	const device_t &device,
 	memory::shared::size_t shared_mem_size)
 {
@@ -40,6 +72,61 @@ inline void validate_shared_mem_compatibility(
 	}
 }
 
+inline void validate_compatibility(
+	const device::id_t           device_id,
+	memory::shared::size_t       shared_mem_size,
+	bool                         cooperative_launch,
+	optional<grid::dimensions_t> block_cluster_dimensions)
+{
+	auto device = device::get(device_id);
+	if (cooperative_launch and not device.supports_block_cooperation()) {
+		throw ::std::runtime_error(device::detail_::identify(device_id)
+			+ " cannot launch kernels with inter-block cooperation");
+	}
+	validate_shared_mem_compatibility(device, shared_mem_size);
+	if (block_cluster_dimensions) {
+#if CUDA_VERSION >= 12000
+		if (not device.supports_block_clustering()) {
+			throw ::std::runtime_error(device::detail_::identify(device_id)
+				+ " cannot launch kernels with clusters of thread blocks");
+		}
+		// TODO: Uncomment this once the CUDA driver offers info on the maximum
+		// cluster size...
+		//
+		// auto max_cluster_size = ???;
+		// auto cluster_size = block_cluster_dimensions.value().volume();
+		// if (cluster_size > max_cluster_size) {
+		// 	throw ::std::runtime_error(device::detail_::identify(device_id)
+		// 		+ " only supports as many as " + ::std::to_string(max_cluster_size)
+		// 		+ " blocks per block-cluster, but " + ::std::to_string(cluster_size)
+		// 		+ " were requested");
+		// }
+#else
+		throw ::std::runtime_error("Block clusters are not supported with CUDA versions earlier than 12.0");
+#endif // CUDA_VERSION >= 12000
+	}
+
+	// The CUDA driver does not offer us information with which we could check the validity
+	// of trying a programmatically dependent launch, or a programmatic completion event,
+	// or the use of a "remote" memory synchronization domain. So, we assume all of that is valid.
+}
+
+template <typename Dims>
+inline void validate_any_dimensions_compatibility(const device_t &device, Dims dims, Dims maxima, const char* kind)
+{
+	auto device_id = device.id();
+	auto check =
+		[device_id, kind](grid::dimension_t dim, grid::dimension_t max, const char *axis) {
+			if (max < dim) {
+				throw ::std::invalid_argument(
+					::std::string("specified ") + kind + " " + axis + "-axis dimension " + ::std::to_string(dim)
+					+ " exceeds the maximum supported " + axis + " dimension of " + ::std::to_string(max)
+					+ " for " + device::detail_::identify(device_id));
+			}
+		};
+	check(dims.x, maxima.x, "X");
+	check(dims.y, maxima.y, "Y");
+	check(dims.z, maxima.z, "Z");
+}
+
 inline void validate_block_dimension_compatibility(
 	const device_t &device,
 	grid::block_dimensions_t block_dims)
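The cooperation check above is reached through the builder's device-compatibility validation. A sketch of exercising it, assuming `dev` is a `cuda::device_t` without cooperative-launch support, and assuming the builder's cooperation setter is named `block_cooperation` (check the actual builder API; the name here is hypothetical):

    try {
        auto config = cuda::launch_config_builder()
            .device(dev)
            .overall_size(2048)
            .block_size(256)
            .block_cooperation(true) // hypothetical setter name
            .build();                // expected to throw, in debug builds
    } catch (::std::runtime_error& e) {
        // message: "... cannot launch kernels with inter-block cooperation"
    }
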
@@ -52,27 +139,28 @@ inline void validate_block_dimension_compatibility(
 		+ ", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
 		+ " for " + device::detail_::identify(device.id()));
 	}
-	auto dim_maxima = grid::block_dimensions_t{
+	auto maxima = grid::block_dimensions_t{
 		static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)),
 		static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
 		static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
 	};
-	auto device_id = device.id();
-	auto check =
-		[device_id](grid::block_dimension_t dim, grid::block_dimension_t max, const char *axis) {
-			if (max < dim) {
-				throw ::std::invalid_argument(
-					::std::string("specified block ") + axis + "-axis dimension " + ::std::to_string(dim)
-					+ " exceeds the maximum supported " + axis + " dimension of " + ::std::to_string(max)
-					+ " for " + device::detail_::identify(device_id));
-			}
-		};
-	check(block_dims.x, dim_maxima.x, "X");
-	check(block_dims.y, dim_maxima.y, "Y");
-	check(block_dims.z, dim_maxima.z, "Z");
+	validate_any_dimensions_compatibility(device, block_dims, maxima, "block");
 }
 
+inline void validate_grid_dimension_compatibility(
+	const device_t &device,
+	grid::dimensions_t grid_dims)
+{
+	auto maxima = grid::dimensions_t{
+		static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)),
+		static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
+		static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
+	};
+	validate_any_dimensions_compatibility(device, grid_dims, maxima, "grid");
+}
+
+
-inline void validate_compatibility(
+inline void validate_shared_mem_size_compatibility(
 	const kernel_t& kernel_ptr,
 	memory::shared::size_t shared_mem_size)
 {
@@ -118,11 +206,7 @@ void enqueue_launch_helper<apriori_compiled_kernel_t, KernelParameters...>::ope
 	// `KernelParameter` pack may contain some references, arrays and so on - which CUDA
 	// kernels cannot accept; so we massage those a bit.
 
-#ifndef NDEBUG
-	validate_compatibility(stream.device(), launch_configuration);
-	validate_compatibility(wrapped_kernel, launch_configuration);
-	// validating the configuration unto-itself should happen within the following function:
-#endif
+	// It is assumed the arguments have already been validated
 
 	detail_::enqueue_raw_kernel_launch_in_current_context(
 		unwrapped_kernel_function,
@@ -140,7 +224,6 @@ marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
 	return ::std::array<const void*, sizeof...(KernelParameters)> {
 		&parameters...
 	};
 }
 
-// Note: The last (valid) element of marshalled_arguments must be null
 inline void enqueue_kernel_launch_by_handle_in_current_context(
 	kernel::handle_t  kernel_function_handle,
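With the grid-dimension check in place, an oversized grid is rejected up front rather than at the driver level. A sketch, assuming a `cuda::device_t` named `dev` (100000 exceeds the usual 65535 y-axis maximum, though the code reads the actual limit from CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y):

    auto config = cuda::launch_config_builder()
        .device(dev)
        .grid_dimensions(cuda::grid::dimensions_t{ 1, 100000, 1 })
        .block_size(128)
        .build(); // expected to throw ::std::invalid_argument, in debug builds
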
@@ -150,9 +233,8 @@ inline void enqueue_kernel_launch_by_handle_in_current_context(
 	launch_configuration_t launch_config,
 	const void** marshalled_arguments)
 {
-#ifndef NDEBUG
-	validate(launch_config);
-#endif
+	// It is assumed the arguments have already been validated
+
 	status_t status;
 	const auto& lc = launch_config; // alias for brevity
 #if CUDA_VERSION >= 12000
@@ -198,25 +280,30 @@ inline void enqueue_kernel_launch_by_handle_in_current_context(
 
 template <typename... KernelParameters>
 struct enqueue_launch_helper<kernel_t, KernelParameters...> {
+
 	void operator()(
 		const kernel_t&        wrapped_kernel,
-		const stream_t &       stream,
+		const stream_t&        stream,
 		launch_configuration_t launch_config,
-		KernelParameters &&... arguments) const
+		KernelParameters&&...  arguments) const
 	{
-		auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
-		auto function_handle = wrapped_kernel.handle();
-		CAW_SET_SCOPE_CONTEXT(stream.context_handle());
+		// It is assumed the arguments have already been validated
+
 #ifndef NDEBUG
-		validate_compatibility(stream.device(), launch_config);
+		if (wrapped_kernel.context() != stream.context()) {
+			throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
+				+ " on " + stream::detail_::identify(stream) + ": Different contexts"};
+		}
 		validate_compatibility(wrapped_kernel, launch_config);
 #endif
+		auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
+		auto function_handle = wrapped_kernel.handle();
+		CAW_SET_SCOPE_CONTEXT(stream.context_handle());
 		enqueue_kernel_launch_by_handle_in_current_context(
 			function_handle, stream.device_id(), stream.context_handle(),
 			stream.handle(), launch_config, marshalled_arguments.data());
-	}
-
+	} // operator()
 };
 
 template <typename RawKernelFunction, typename... KernelParameters>
@@ -228,14 +315,12 @@ void enqueue_launch(
 	launch_configuration_t    launch_configuration,
 	KernelParameters&&...     parameters)
 {
+	// It is assumed the arguments have already been validated
+
 	// Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
 	// and not have trouble enqueueing into a stream in another context - it balks at doing so under
 	// certain conditions, so we must place ourselves in the stream's context.
 	CAW_SET_SCOPE_CONTEXT(stream.context_handle());
-#ifndef NDEBUG
-	validate_compatibility(stream.device(), launch_configuration);
-	// validating the configuration without the device should happen within the next function...
-#endif
 	detail_::enqueue_raw_kernel_launch_in_current_context(
 		kernel_function, stream.device_id(), stream.context_handle(), stream.handle(),
 		launch_configuration, ::std::forward<KernelParameters>(parameters)...);
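One practical effect of the new context check in the `kernel_t` helper: a kernel/stream context mismatch now surfaces as `::std::invalid_argument` at enqueue time (in debug builds), rather than as an opaque driver error later. A sketch, assuming `kern` is a `cuda::kernel_t` from one context and `stream_b` is a stream belonging to a different one:

    try {
        cuda::enqueue_launch(kern, stream_b, config, arg);
    } catch (::std::invalid_argument& e) {
        // message: "Attempt to launch ... on ...: Different contexts"
    }
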
@@ -250,6 +335,15 @@ void enqueue_launch(
 	launch_configuration_t    launch_configuration,
 	KernelParameters&&...     parameters)
 {
+	// It is assumed the arguments have already been validated - except for
+	// the following checks:
+#ifndef NDEBUG
+	if (kernel.context() != stream.context()) {
+		throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
+			+ " on " + stream::detail_::identify(stream) + ": Different contexts"};
+	}
+	detail_::validate_compatibility(kernel, launch_configuration);
+#endif // #ifndef NDEBUG
+
 	enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
 		::std::forward<Kernel>(kernel), stream, launch_configuration,
 		::std::forward<KernelParameters>(parameters)...);
@@ -265,6 +359,10 @@ void enqueue_launch(
 	launch_configuration_t    launch_configuration,
 	KernelParameters&&...     parameters)
 {
+	// The launch configuration is assumed to have been validated separately
+	// from the kernel; its compatibility with the kernel will be validated
+	// further inside, against the contextualized kernel
+
 	kernel_t contextualized = cuda::contextualize(kernel, stream.context());
 	enqueue_launch_helper<kernel_t, KernelParameters...> {}(
 		contextualized, stream, launch_configuration,
@@ -280,6 +378,8 @@ inline void launch(
 	launch_configuration_t  launch_configuration,
 	KernelParameters&&...   parameters)
 {
+	// Argument validation will occur within the call to enqueue_launch
+
 	auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
 	auto stream = primary_context.default_stream();
 
@@ -296,11 +396,18 @@ inline void launch_type_erased(
 	launch_configuration_t   launch_configuration,
 	SpanOfConstVoidPtrLike   marshalled_arguments)
 {
+	// Note: in debug builds, the kernel, stream and launch configuration are
+	// validated below; release builds assume they have already been validated
 	static_assert(
 		::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or
 		::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
 		"The element type of the marshalled arguments container type must be either void* or const void*");
 #ifndef NDEBUG
+	if (kernel.context() != stream.context()) {
+		throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
+			+ " on " + stream::detail_::identify(stream) + ": Different contexts"};
+	}
+	detail_::validate_compatibility(kernel, launch_configuration);
 	if (*(marshalled_arguments.end() - 1) != nullptr) {
 		throw ::std::invalid_argument("marshalled arguments for a kernel launch must end with a nullptr element");
 	}
@@ -323,6 +430,7 @@ void launch_type_erased(
 	launch_configuration_t  launch_configuration,
 	SpanOfConstVoidPtrLike  marshalled_arguments)
 {
+	// Argument validation will occur inside the call to launch_type_erased
+
 	auto contextualized = contextualize(kernel, stream.context());
 	launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
 }
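`launch_type_erased` continues to require a nullptr-terminated span of argument pointers, now alongside the kernel-compatibility checks. A sketch, assuming a `cuda::kernel_t` `kern`, a stream, a launch configuration `config`, and a device-side pointer `buf`:

    float factor = 2.0f;
    ::std::array<const void*, 3> args = { &buf, &factor, nullptr }; // must end with nullptr
    cuda::launch_type_erased(kern, stream, config, args);

diff --git a/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp b/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp
index b4b4c3bd..670c4ead 100644
--- a/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp
+++ b/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp
@@ -9,45 +9,57 @@
 #define CUDA_API_WRAPPERS_MULTI_WRAPPERS_LAUNCH_CONFIGURATION_HPP
 
 #include "../launch_configuration.hpp"
+#include "../kernel.hpp"
 
 namespace cuda {
 
 namespace detail_ {
 
+inline void validate_compatibility(
+	const kernel_t& kernel,
+	launch_configuration_t launch_config) noexcept(false)
+{
+	validate(launch_config);
+	validate_block_dimension_compatibility(kernel, launch_config.dimensions.block);
+	// Uncomment if we actually get such checks
+	// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
+	validate_compatibility(kernel.device(), launch_config);
+}
+
 #if CUDA_VERSION >= 12000
 inline CUlaunchConfig marshal(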
 	const launch_configuration_t &config,
 	const stream::handle_t stream_handle,
 	span<CUlaunchAttribute> attribute_storage) noexcept(true)
 {
-	unsigned int attribute_index = 0;
+	unsigned int num_attributes = 0;
 	// TODO: What about CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW ?
 	if (config.block_cooperation) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
 		attr_value.value.cooperative = 1;
 	}
 	if (grid::dimensions_t::point() != config.clustering.cluster_dimensions) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
 		attr_value.value.clusterDim.x = config.clustering.cluster_dimensions.x;
 		attr_value.value.clusterDim.y = config.clustering.cluster_dimensions.y;
 		attr_value.value.clusterDim.z = config.clustering.cluster_dimensions.z;
 	}
 	if (config.clustering.scheduling_policy != cluster_scheduling_policy_t::default_) {
-		auto &attribute = attribute_storage[attribute_index++];
+		auto &attribute = attribute_storage[num_attributes++];
 		attribute.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
 		attribute.value.clusterSchedulingPolicyPreference =
 			static_cast<CUclusterSchedulingPolicy>(config.clustering.scheduling_policy);
 	}
 	if (config.programmatically_dependent_launch) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
 		attr_value.value.programmaticStreamSerializationAllowed = 1;
 	}
 	if (config.programmatic_completion.event) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT;
 		attr_value.value.programmaticEvent.event = config.programmatic_completion.event->handle();
 		// TODO: What about the flags?
@@ -56,12 +68,12 @@ inline CUlaunchConfig marshal(
 	}
 	// What about CU_LAUNCH_ATTRIBUTE_PRIORITY ?
 	if (config.in_remote_memory_synchronization_domain) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN;
 		attr_value.value.memSyncDomain = CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE;
 	}
 	// What about CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP ?
-	attribute_storage[attribute_index] = { CU_LAUNCH_ATTRIBUTE_IGNORE, {}};
+	attribute_storage[num_attributes] = { CU_LAUNCH_ATTRIBUTE_IGNORE, {} };
 
 	return {
 		config.dimensions.grid.x,
@@ -73,7 +85,7 @@ inline CUlaunchConfig marshal(
 		config.dynamic_shared_memory_size,
 		stream_handle,
 		attribute_storage.data(),
-		attribute_index
+		num_attributes
 	};
 }
 #endif // CUDA_VERSION >= 12000
diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp
index 5c32ad36..5156776d 100644
--- a/src/cuda/api/types.hpp
+++ b/src/cuda/api/types.hpp
@@ -428,6 +428,14 @@ struct dimensions_t // this almost-inherits dim3
 	static constexpr __host__ __device__ dimensions_t square(dimension_t x)   { return dimensions_t{ x, x, 1 }; }
 	static constexpr __host__ __device__ dimensions_t line(dimension_t x)     { return dimensions_t{ x, 1, 1 }; }
 	static constexpr __host__ __device__ dimensions_t point()                 { return dimensions_t{ 1, 1, 1 }; }
+
+	static bool divides(dimensions_t lhs, dimensions_t rhs)
+	{
+		return
+			(rhs.x % lhs.x == 0) and
+			(rhs.y % lhs.y == 0) and
+			(rhs.z % lhs.z == 0);
+	}
 };
 
 ///@cond
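
`divides` is a per-axis divisibility predicate, used above to validate cluster dimensions against grid dimensions. For illustration:

    auto cluster = cuda::grid::dimensions_t{ 2, 2, 1 };
    auto grid    = cuda::grid::dimensions_t{ 8, 4, 1 };
    cuda::grid::dimensions_t::divides(cluster, grid); // true:  2|8, 2|4, 1|1
    cuda::grid::dimensions_t::divides(grid, cluster); // false: 8 does not divide 2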