From fecf2d627b7ab8b3bfd2001d74907f5397e6e5bc Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg
Date: Sun, 10 Mar 2024 01:31:53 +0200
Subject: [PATCH] Fixes #609, #610: Replace the use of our `cuda::dynarray`
 (i.e. `std::vector`'s, effectively) with `cuda::unique_span`'s, and make sure
 they're padded by an extra allocated character set to '\0'

---
 .../clock_nvrtc/clock.cpp                   |  4 +-
 src/cuda/api/library.hpp                    |  2 +-
 src/cuda/api/module.hpp                     |  2 +-
 src/cuda/api/multi_wrapper_impls/memory.hpp |  8 ++-
 src/cuda/api/types.hpp                      |  5 --
 src/cuda/rtc/compilation_output.hpp         | 59 +++++++++++--------
 src/cuda/rtc/program.hpp                    |  7 +--
 7 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp
index 40b66da8..7bf80379 100644
--- a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp
+++ b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp
@@ -106,7 +106,7 @@ long double compute_average_elapsed_clocks(const clock_t* timers, std::size_t nu
 	return offset_sum / num_blocks;
 }
 
-cuda::dynarray<char> compile_to_cubin(
+cuda::unique_span<char> compile_to_cubin(
 	const char* kernel_source,
 	const char* kernel_name,
 	cuda::device_t target_device)
@@ -134,7 +134,7 @@ int main()
 	auto device_id { 0 }; // Not bothering with supporting a command-line argument here
 	auto device = cuda::device::get(device_id);
 	auto cubin = compile_to_cubin(clock_kernel::source, clock_kernel::name, device);
-	auto module = cuda::module::create(device, cubin);
+	auto module = cuda::module::create(device, cubin.get());
 	auto kernel_in_module = module.get_kernel(clock_kernel::name);
 
 	cuda::grid::dimension_t num_blocks { 64 };
diff --git a/src/cuda/api/library.hpp b/src/cuda/api/library.hpp
index a03eaeb8..26f4294c 100644
--- a/src/cuda/api/library.hpp
+++ b/src/cuda/api/library.hpp
@@ -63,7 +63,7 @@ ::std::string identify(const library_t &library);
  * Create a CUDA driver library of compiled code from raw image data.
  *
  * @param[in] module_data the opaque, raw binary data for the module - in a contiguous container
- *     such as a span, a cuda::dynarray etc..
+ *     such as a span, a cuda::unique_span etc..
  */
 ///@{
 template <typename ContiguousContainer,
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
 template <typename DeviceRange>
 void set_access_permissions(DeviceRange devices, const pool_t& pool, access_permissions_t permissions)
 {
-	cuda::dynarray<cuda::device::id_t> device_ids(devices.size());
-	::std::transform(::std::begin(devices), ::std::end(devices), device_ids.begin());
-	span<cuda::device::id_t> device_ids_span {device_ids.data(), device_ids.size()};
+	// Not depending on unique_span here :-(
+	auto device_ids = ::std::unique_ptr<cuda::device::id_t[]>(new cuda::device::id_t[devices.size()]);
+	auto device_ids_sp = span<cuda::device::id_t>(device_ids.get(), devices.size());
+	::std::transform(::std::begin(devices), ::std::end(devices), device_ids_sp.begin(), [](const device_t& device) { return device.id(); });
+	span<cuda::device::id_t> device_ids_span {device_ids_sp.data(), device_ids_sp.size()};
 	cuda::memory::detail_::set_access_permissions(device_ids_span, pool.handle(), permissions);
 }
 #endif // #if CUDA_VERSION >= 11020
diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp
index 47446f8a..ed5151dd 100644
--- a/src/cuda/api/types.hpp
+++ b/src/cuda/api/types.hpp
@@ -895,11 +895,6 @@ using handle_t = CUfunction;
 
 } // namespace kernel
 
-// The C++ standard library doesn't offer ::std::dynarray (although it almost did),
-// and we won't introduce our own here. So...
-template <typename T>
-using dynarray = ::std::vector<T>;
-
 } // namespace cuda
 
 #ifndef __CUDACC__
diff --git a/src/cuda/rtc/compilation_output.hpp b/src/cuda/rtc/compilation_output.hpp
index ea052053..cbc08c52 100644
--- a/src/cuda/rtc/compilation_output.hpp
+++ b/src/cuda/rtc/compilation_output.hpp
@@ -311,23 +311,22 @@ class compilation_output_base_t {
 		return { buffer.data(), size };
 	}
 
-	dynarray<char> log() const
+	unique_span<char> log() const
 	{
 		size_t size = program::detail_::get_log_size(program_handle_, program_name_.c_str());
-		::std::vector<char> result(size+1);
-		if (size == 0) { return result; }
+		auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+		if (size == 0) {
+			result[size] = '\0';
+			return result;
+		}
 		program::detail_::get_log(result.data(), program_handle_, program_name_.c_str());
-		// Q: Isn't it kind of "cheating" to use an ::std::vector, then return it as a dynarray? What
-		//    if we get a proper dynarray which doesn't alias ::std::vector?
-		// A: Well, kind of; it would mean we might have to copy. However - a proper dynarray might
-		//    allow us to construct it with an arbitrary buffer, or a larger dynarray etc. - and
-		//    then we could ensure the allocation happens only once.
+		result[size] = '\0';
 		return result;
 	}
 	///@}
 
 #if CUDA_VERSION >= 11010
-	virtual dynarray<char> cubin() const = 0;
+	virtual unique_span<char> cubin() const = 0;
 	virtual bool has_cubin() const = 0;
 #endif
 
@@ -404,12 +403,16 @@ class compilation_output_t<cuda_cpp> : public compilation_output_base_t<cuda_cpp>
 	}
 
-	dynarray<char> ptx() const
+	unique_span<char> ptx() const
 	{
 		size_t size = program::detail_::get_ptx_size(program_handle_, program_name_.c_str());
-		dynarray<char> result(size);
-		if (size == 0) { return result; }
+		auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+		if (size == 0) {
+			result[size] = '\0';
+			return result;
+		}
 		program::detail_::get_ptx(result.data(), program_handle_, program_name_.c_str());
+		result[size] = '\0';
 		return result;
 	}
 	///@}
 
@@ -450,10 +453,10 @@ class compilation_output_t<cuda_cpp> : public compilation_output_base_t<cuda_cpp>
 
-	dynarray<char> cubin() const override
+	unique_span<char> cubin() const override
 	{
 		size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str());
-		dynarray<char> result(size);
+		auto result = make_unique_span<char>(size);
 		if (size == 0) { return result; }
 		program::detail_::get_cubin(result.data(), program_handle_, program_name_.c_str());
 		return result;
 	}
@@ -500,12 +503,16 @@ class compilation_output_t<cuda_cpp> : public compilation_output_base_t<cuda_cpp>
 	}
 
-	dynarray<char> lto_ir() const
+	unique_span<char> lto_ir() const
 	{
 		size_t size = program::detail_::get_lto_ir_size(program_handle_, program_name_.c_str());
-		dynarray<char> result(size);
-		if (size == 0) { return result; }
+		auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+		if (size == 0) {
+			result[size] = '\0';
+			return result;
+		}
 		program::detail_::get_lto_ir(result.data(), program_handle_, program_name_.c_str());
+		result[size] = '\0';
 		return result;
 	}
 	/// @}
 
@@ -592,12 +599,16 @@ class compilation_output_t<ptx> : public compilation_output_base_t<ptx> {
 	{ }
 
 public: // non-mutators
-	dynarray<char> cubin() const override
+	unique_span<char> cubin() const override
 	{
 		size_t size = program::detail_::get_cubin_size(program_handle_, program_name_.c_str());
-		dynarray<char> result(size);
-		if (size == 0) { return result; }
+		auto result = make_unique_span<char>(size+1); // Let's append a trailing nul character, to be on the safe side
+		if (size == 0) {
+			result[size] = '\0';
+			return result;
+		}
 		program::detail_::get_cubin(result.data(), program_handle_, program_name_.c_str());
+		result[size] = '\0';
 		return result;
 	}
 	///@}
@@ -668,15 +679,15 @@ template<> inline module_t create(
 	// Note: The above won't fail even if no CUBIN was produced
 	bool has_cubin = (cubin_size > 0);
 	if (has_cubin) {
-		dynarray<char> cubin(cubin_size);
+		auto cubin = make_unique_span<char>(cubin_size);
 		rtc::program::detail_::get_cubin(cubin.data(), program_handle, program_name);
-		return module::create(context, cubin, options);
+		return module::create(context, cubin.get(), options);
 	}
 	// Note: At this point, we must have PTX in the output, as otherwise the compilation could
 	// not have succeeded
 #endif
 	auto ptx = compiled_program.ptx();
-	return module::create(context, ptx, options);
+	return module::create(context, ptx.get(), options);
 }
 
 #if CUDA_VERSION >= 11010
@@ -690,7 +701,7 @@ template<> inline module_t create(
 			+ cuda::rtc::program::detail_::identify(compiled_program.program_handle()));
 	}
 	auto cubin = compiled_program.cubin();
-	return module::create(context, cubin, options);
+	return module::create(context, cubin.get(), options);
 }
 
 #endif // CUDA_VERSION >= 11010
diff --git a/src/cuda/rtc/program.hpp b/src/cuda/rtc/program.hpp
index ee8b6a51..aa16c583 100644
--- a/src/cuda/rtc/program.hpp
+++ b/src/cuda/rtc/program.hpp
@@ -604,7 +604,7 @@ inline program_t create(const ::std::string& program_name)
 } // namespace program
 
 #if CUDA_VERSION >= 11020
-inline dynarray<device::compute_capability_t>
+inline unique_span<device::compute_capability_t>
 supported_targets()
 {
 	int num_supported_archs;
@@ -613,9 +613,8 @@ supported_targets()
 	auto raw_archs = ::std::unique_ptr<int[]>(new int[num_supported_archs]);
 	status = nvrtcGetSupportedArchs(raw_archs.get());
 	throw_if_error(status, "Failed obtaining the architectures supported by NVRTC");
-	dynarray<device::compute_capability_t> result;
-	result.reserve(num_supported_archs);
-	::std::transform(raw_archs.get(), raw_archs.get() + num_supported_archs, ::std::back_inserter(result),
+	auto result = make_unique_span<device::compute_capability_t>(num_supported_archs);
+	::std::transform(raw_archs.get(), raw_archs.get() + num_supported_archs, ::std::begin(result),
 		[](int raw_arch) { return device::compute_capability_t::from_combined_number(raw_arch); });
 	return result;
 }
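
Usage note (illustrative sketch, not part of the patch itself): after this change, the compilation
outputs and supported_targets() come back as cuda::unique_span's rather than cuda::dynarray's
(i.e. ::std::vector's), and call sites hand the underlying data onwards via .get(), as the patched
clock_nvrtc example shows. For the text-like outputs such as log() and ptx(), the extra allocated
element set to '\0' means result.data() can also be used where a NUL-terminated string is expected.
The snippet below sketches what consuming code can look like after the patch; the header names and
the load_cubin()/main() scaffolding are assumptions for illustration only, not part of the library
or of this commit.

    #include <cuda/api.hpp>
    #include <cuda/rtc.hpp>
    #include <iostream>

    // Loads compiled CUBIN data - e.g. what compilation_output_t::cubin() returns after this
    // patch (a cuda::unique_span<char>) - as a module on the given device.
    cuda::module_t load_cubin(const cuda::device_t& device, const cuda::unique_span<char>& cubin)
    {
        // The unique_span is no longer passed directly, as the dynarray (::std::vector) used
        // to be; the patch switches call sites to hand over the underlying span via .get():
        return cuda::module::create(device, cubin.get());
    }

    int main()
    {
        // supported_targets() now also returns a unique_span instead of a dynarray; it is
        // still a contiguous, sized range, so this kind of use is unchanged:
        auto targets = cuda::rtc::supported_targets();
        std::cout << "NVRTC can target " << targets.size() << " compute capabilities\n";
    }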