diff --git a/libs/core/async_cuda/CMakeLists.txt b/libs/core/async_cuda/CMakeLists.txt
index 77782a23bc47..b2a7954877bb 100644
--- a/libs/core/async_cuda/CMakeLists.txt
+++ b/libs/core/async_cuda/CMakeLists.txt
@@ -36,7 +36,7 @@ set(async_cuda_compat_headers
 # cmake-format: on
 
 set(async_cuda_sources cuda_event_callback.cpp cuda_future.cpp cuda_target.cpp
-                       get_targets.cpp
+                       get_targets.cpp cuda_event.cpp
 )
 
 if(HPX_WITH_HIP AND TARGET roc::hipblas)
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index f2ae7ef59f5e..ce4db339969f 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -1,12 +1,19 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2020 Teodor Nikolov
 //
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #pragma once
 
+#include <cstddef>
+#include <deque>
+
+#include <hpx/assert.hpp>
 #include <hpx/async_cuda/cuda_exception.hpp>
 #include <hpx/async_cuda/custom_gpu_api.hpp>
 #include <hpx/concurrency/stack.hpp>
@@ -19,55 +26,90 @@ namespace hpx { namespace cuda { namespace experimental {
     // of them at startup.
     struct cuda_event_pool
     {
-        static constexpr int initial_events_in_pool = 128;
-
-        static cuda_event_pool& get_event_pool()
-        {
-            static cuda_event_pool event_pool_;
-            return event_pool_;
-        }
+        static constexpr std::size_t initial_events_in_pool = 128;
 
-        // create a bunch of events on initialization
-        cuda_event_pool()
-          : free_list_(initial_events_in_pool)
-        {
-            for (int i = 0; i < initial_events_in_pool; ++i)
-            {
-                add_event_to_pool();
-            }
-        }
+        HPX_CORE_EXPORT static cuda_event_pool& get_event_pool();
 
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
-            cudaEvent_t event;
-            bool ok = true;
-            while (ok)
+            HPX_ASSERT_MSG(free_lists_.size() ==
+                    static_cast<std::size_t>(max_number_devices_),
+                "Number of CUDA event pools does not match the number of "
+                "devices!");
+            for (int device = 0; device < max_number_devices_; device++)
             {
-                ok = free_list_.pop(event);
-                if (ok)
-                    check_cuda_error(cudaEventDestroy(event));
+                check_cuda_error(cudaSetDevice(device));
+                cudaEvent_t event;
+                bool ok = true;
+                while (ok)
+                {
+                    ok = free_lists_[device].pop(event);
+                    if (ok)
+                        check_cuda_error(cudaEventDestroy(event));
+                }
             }
         }
 
-        inline bool pop(cudaEvent_t& event)
+        inline bool pop(cudaEvent_t& event, int device = 0)
         {
+            HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
+                "Accessing CUDA event pool with invalid device ID!");
             // pop an event off the pool, if that fails, create a new one
-            while (!free_list_.pop(event))
+            while (!free_lists_[device].pop(event))
             {
-                add_event_to_pool();
+                // Save current device
+                int original_device = -1;
+                check_cuda_error(cudaGetDevice(&original_device));
+                add_event_to_pool(device);
+                // reset to original device if required
+                if (original_device != device)
+                    check_cuda_error(cudaSetDevice(original_device));
             }
             return true;
         }
 
-        inline bool push(cudaEvent_t event)
+        inline bool push(cudaEvent_t event, int device = 0)
         {
-            return free_list_.push(event);
+            HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
+                "Accessing CUDA event pool with invalid device ID!");
+            return free_lists_[device].push(event);
         }
 
+        // delete copy / move constructors
+        cuda_event_pool(cuda_event_pool&&) = delete;
+        cuda_event_pool& operator=(cuda_event_pool&&) = delete;
+        cuda_event_pool(cuda_event_pool const&) = delete;
+        cuda_event_pool& operator=(cuda_event_pool const&) = delete;
+
     private:
-        void add_event_to_pool()
+        // Private singleton constructor. Creates a initial_events_in_pool of
+        // events on initialization
+        cuda_event_pool()
+          : max_number_devices_(0)
+        {
+            check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
+            HPX_ASSERT_MSG(max_number_devices_ > 0,
+                "CUDA polling enabled and called, yet no CUDA device found!");
+            int original_device = 0;
+            check_cuda_error(cudaGetDevice(&original_device));
+            for (int device = 0; device < max_number_devices_; device++)
+            {
+                check_cuda_error(cudaSetDevice(device));
+                free_lists_.emplace_back(initial_events_in_pool);
+                for (std::size_t i = 0; i < initial_events_in_pool; ++i)
+                {
+                    add_event_to_pool(device);
+                }
+            }
+            // Restore original device -- avoids confusion about HPX switching
+            // devices for the current thread if only one device is used.
+            check_cuda_error(cudaSetDevice(original_device));
+        }
+
+        void add_event_to_pool(int device)
         {
+            check_cuda_error(cudaSetDevice(device));
             cudaEvent_t event;
             // Create an cuda_event to query a CUDA/CUBLAS kernel for completion.
             // Timing is disabled for performance. [1]
@@ -75,10 +117,11 @@ namespace hpx { namespace cuda { namespace experimental {
             // [1]: CUDA Runtime API, section 5.5 cuda_event Management
             check_cuda_error(
                 cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-            free_list_.push(event);
+            free_lists_[device].push(event);
         }
+        int max_number_devices_;
 
-        // pool is dynamically sized and can grow if needed
-        hpx::lockfree::stack<cudaEvent_t> free_list_;
+        // One pool per GPU - each pool is dynamically sized and can grow if needed
+        std::deque<hpx::lockfree::stack<cudaEvent_t>> free_lists_;
     };
 }}}    // namespace hpx::cuda::experimental
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 042b8199dbeb..e5e617c47171 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Thomas Heller
 //  Copyright (c) 2016 Hartmut Kaiser
@@ -5,6 +6,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #pragma once
 
@@ -81,7 +84,7 @@ namespace hpx { namespace cuda { namespace experimental {
             future_data() {}
 
             future_data(init_no_addref no_addref, other_allocator const& alloc,
-                cudaStream_t stream)
+                cudaStream_t stream, int device)
               : lcos::detail::future_data_allocator<void, Allocator,
                     future_data>(no_addref, alloc)
             {
@@ -104,7 +107,7 @@ namespace hpx { namespace cuda { namespace experimental {
                                     status)));
                         }
                     },
-                    stream);
+                    stream, device);
             }
         };
 
@@ -183,7 +186,8 @@ namespace hpx { namespace cuda { namespace experimental {
         // main API call to get a future from a stream using allocator, and the
         // specified mode
         template <typename Allocator, typename Mode>
-        hpx::future<void> get_future(Allocator const& a, cudaStream_t stream)
+        hpx::future<void> get_future(
+            Allocator const& a, cudaStream_t stream, int device)
         {
             using shared_state = future_data<Allocator, Mode>;
 
@@ -200,7 +204,19 @@ namespace hpx { namespace cuda { namespace experimental {
             unique_ptr p(traits::allocate(alloc, 1),
                 hpx::util::allocator_deleter<other_allocator>{alloc});
 
-            traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream);
+            static_assert(std::is_same_v<Mode, event_mode> ||
+                    std::is_same_v<Mode, callback_mode>,
+                "get_future mode not supported!");
+            if constexpr (std::is_same_v<Mode, event_mode>)
+            {
+                traits::construct(
+                    alloc, p.get(), init_no_addref{}, alloc, stream, device);
+            }
+            else if constexpr (std::is_same_v<Mode, callback_mode>)
+            {
+                traits::construct(
+                    alloc, p.get(), init_no_addref{}, alloc, stream);
+            }
 
             return hpx::traits::future_access<future<void>>::create(
                 p.release(), false);
@@ -212,16 +228,20 @@ namespace hpx { namespace cuda { namespace experimental {
         hpx::future<void> get_future_with_callback(
             Allocator const& a, cudaStream_t stream)
         {
-            return get_future<Allocator, callback_mode>(a, stream);
+            // device id 0 will be dropped in callback mode - can be
+            // an arbitrary number here
+            return get_future<Allocator, callback_mode>(a, stream, 0);
         }
 
         // -------------------------------------------------------------
         // main API call to get a future from a stream using allocator
         template <typename Allocator>
         hpx::future<void> get_future_with_event(
-            Allocator const& a, cudaStream_t stream)
+            Allocator const& a, cudaStream_t stream, int device = -1)
         {
-            return get_future<Allocator, event_mode>(a, stream);
+            if (device == -1)
+                check_cuda_error(cudaGetDevice(&device));
+            return get_future<Allocator, event_mode>(a, stream, device);
         }
 
         // -------------------------------------------------------------
@@ -231,7 +251,8 @@ namespace hpx { namespace cuda { namespace experimental {
 
         // -------------------------------------------------------------
         // non allocator version of : get future with an event set
-        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t);
+        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
+            cudaStream_t stream, int device = -1);
     }    // namespace detail
 }}}      // namespace hpx::cuda::experimental
 
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
index fde4361569d5..e1e6ef49cb20 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
@@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
         hpx::move_only_function<void(cudaError_t)>;
 
     HPX_CORE_EXPORT void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream);
+        event_callback_function_type&& f, cudaStream_t stream, int device = 0);
 
     HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
     HPX_CORE_EXPORT void unregister_polling(
diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp
new file mode 100644
index 000000000000..1425937a8f29
--- /dev/null
+++ b/libs/core/async_cuda/src/cuda_event.cpp
@@ -0,0 +1,19 @@
+//  Copyright (c) 2023 Gregor Daiß
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
+
+#include <hpx/assert.hpp>
+#include <hpx/async_cuda/cuda_event.hpp>
+#include <hpx/async_cuda/custom_gpu_api.hpp>
+
+namespace hpx { namespace cuda { namespace experimental {
+    cuda_event_pool& cuda_event_pool::get_event_pool()
+    {
+        static cuda_event_pool event_pool_;
+        return event_pool_;
+    }
+}}}    // namespace hpx::cuda::experimental
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index 0d53bd4af3f7..90698b402a98 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2021 ETH Zurich
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Hartmut Kaiser
@@ -6,6 +7,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/config.hpp>
 #include <hpx/assert.hpp>
@@ -50,6 +53,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
     {
         cudaEvent_t event;
         event_callback_function_type f;
+        int device;
     };
 
     using event_callback_queue_type =
@@ -104,17 +108,18 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
     }
 
     void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream)
+        event_callback_function_type&& f, cudaStream_t stream, int device)
     {
         cudaEvent_t event;
-        if (!cuda_event_pool::get_event_pool().pop(event))
+        if (!cuda_event_pool::get_event_pool().pop(event, device))
         {
             HPX_THROW_EXCEPTION(hpx::error::invalid_status,
                 "add_event_callback", "could not get an event");
         }
         check_cuda_error(cudaEventRecord(event, stream));
 
-        detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)});
+        detail::add_to_event_callback_queue(
+            event_callback{event, HPX_MOVE(f), device});
     }
 
     // Background progress function for async CUDA operations. Checks for completed
@@ -177,7 +182,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                         "active events",
                         debug::dec<3>(get_number_of_active_events()));
                     continuation.f(status);
-                    pool.push(HPX_MOVE(continuation.event));
+                    pool.push(
+                        HPX_MOVE(continuation.event), continuation.device);
                     return true;
                 }),
             event_callback_vector.end());
@@ -199,7 +205,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                     "active events",
                     debug::dec<3>(get_number_of_active_events()));
                 continuation.f(status);
-                pool.push(HPX_MOVE(continuation.event));
+                pool.push(HPX_MOVE(continuation.event), continuation.device);
             }
         }
 
diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp
index 6b1dfbbaa436..accb87e1ff89 100644
--- a/libs/core/async_cuda/src/cuda_future.cpp
+++ b/libs/core/async_cuda/src/cuda_future.cpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Hartmut Kaiser
 //  Copyright (c) 2016 Thomas Heller
@@ -5,6 +6,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/allocator_support/internal_allocator.hpp>
 #include <hpx/assert.hpp>
@@ -18,8 +21,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
             hpx::util::internal_allocator<>{}, stream);
     }
 
-    hpx::future<void> get_future_with_event(cudaStream_t stream)
+    hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
     {
-        return get_future_with_event(hpx::util::internal_allocator<>{}, stream);
+        return get_future_with_event(
+            hpx::util::internal_allocator<>{}, stream, device);
     }
 }}}}    // namespace hpx::cuda::experimental::detail
diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp
index 6906308ebc62..60ea190980f0 100644
--- a/libs/core/async_cuda/src/cuda_target.cpp
+++ b/libs/core/async_cuda/src/cuda_target.cpp
@@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental {
 
     hpx::future<void> target::get_future_with_event() const
     {
-        return detail::get_future_with_event(handle_.get_stream());
+        return detail::get_future_with_event(
+            handle_.get_stream(), handle_.get_device());
     }
 
     hpx::future<void> target::get_future_with_callback() const
diff --git a/libs/core/async_cuda/tests/unit/CMakeLists.txt b/libs/core/async_cuda/tests/unit/CMakeLists.txt
index ce6cf5a19ee2..b14933107024 100644
--- a/libs/core/async_cuda/tests/unit/CMakeLists.txt
+++ b/libs/core/async_cuda/tests/unit/CMakeLists.txt
@@ -4,16 +4,19 @@
 # Distributed under the Boost Software License, Version 1.0. (See accompanying
 # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-set(tests cuda_future transform_stream)
+set(tests cuda_future cuda_multi_device_polling transform_stream)
 if(HPX_WITH_GPUBLAS)
   set(benchmarks ${benchmarks} cublas_matmul)
 endif()
 
 set(cublas_matmul_PARAMETERS THREADS_PER_LOCALITY 4)
 set(cuda_future_PARAMETERS THREADS_PER_LOCALITY 4)
+set(cuda_multi_device_polling_PARAMETERS THREADS_PER_LOCALITY 4)
 set(transform_stream_PARAMETERS THREADS_PER_LOCALITY 4)
 
 set(cuda_future_CUDA_SOURCE saxpy trivial_demo)
+set(cuda_multi_device_polling_CUDA_SOURCE trivial_demo)
+
 set(transform_stream_CUDA ON)
 
 foreach(test ${tests})
diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
new file mode 100644
index 000000000000..ca1ee5579c7f
--- /dev/null
+++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
@@ -0,0 +1,105 @@
+//  Copyright (c) 2023 Gregor Daiß
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
+
+#include <hpx/algorithm.hpp>
+#include <hpx/assert.hpp>
+#include <hpx/execution.hpp>
+#include <hpx/future.hpp>
+#include <hpx/init.hpp>
+#include <hpx/modules/async_cuda.hpp>
+#include <hpx/modules/testing.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+// -------------------------------------------------------------------------
+// This example is similar to the unit/cuda_future.cpp example (hence it also uses
+// the externally defined cuda_trivial_kernel. See unit/cuda_future.cpp for
+// more details regarding this.
+//
+// This example extends unit/cuda_future.cpp by testing the cuda event polling
+// on multiple devices (if available)! If the polling is not working correctly,
+// the test will time out as some of the futures are never triggered.
+
+template <typename T>
+extern void cuda_trivial_kernel(T, cudaStream_t stream);
+
+extern void launch_saxpy_kernel(
+    hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks,
+    unsigned int& threads, void** args);
+
+// -------------------------------------------------------------------------
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    // install cuda future polling handler
+    hpx::cuda::experimental::enable_user_polling poll("default");
+
+    // Print all targets for debug purposes
+    hpx::cuda::experimental::print_local_targets();
+
+    int number_devices = 0;
+    hpx::cuda::experimental::check_cuda_error(
+        cudaGetDeviceCount(&number_devices));
+    HPX_ASSERT(number_devices > 0);
+
+    // Check if the futures complete when using executors on all devices
+    std::vector<hpx::shared_future<void>> futs(number_devices);
+    for (auto device_id = 0; device_id < number_devices; device_id++)
+    {
+        hpx::cuda::experimental::cuda_executor exec(
+            device_id, hpx::cuda::experimental::event_mode{});
+        auto fut = hpx::async(exec, cuda_trivial_kernel<float>,
+            static_cast<float>(device_id) + 1);
+        futs[device_id] = fut.then([device_id](hpx::future<void>&&) {
+            std::cout << "Continuation for kernel future triggered on device "
+                         "executor "
+                      << device_id << std::endl;
+        });
+    }
+    auto final_fut = hpx::when_all(futs);
+    std::cout << "All executor test kernels launched! " << std::endl;
+    final_fut.get();
+    std::cout << "All executor test kernels finished! " << std::endl;
+
+    // Test to see if HPX correctly picks up the current device in case
+    // get_future_with_event is not given a device_id
+    for (auto device_id = 0; device_id < number_devices; device_id++)
+    {
+        hpx::cuda::experimental::check_cuda_error(cudaSetDevice(device_id));
+        cudaStream_t device_stream;
+        hpx::cuda::experimental::check_cuda_error(
+            cudaStreamCreate(&device_stream));
+        cuda_trivial_kernel<float>(
+            number_devices + device_id + 1, device_stream);
+        auto fut = hpx::cuda::experimental::detail::get_future_with_event(
+            device_stream);
+        fut.get();
+        std::cout << "get_future_with_event default ID test finished on device "
+                  << device_id << std::endl;
+        hpx::cuda::experimental::check_cuda_error(
+            cudaStreamDestroy(device_stream));
+    }
+
+    return hpx::local::finalize();
+}
+
+// -------------------------------------------------------------------------
+int main(int argc, char** argv)
+{
+    std::cout << "[HPX Cuda multi device polling] - Starting...\n" << std::endl;
+
+    hpx::local::init_params init_args;
+
+    auto result = hpx::local::init(hpx_main, argc, argv, init_args);
+    return result || hpx::util::report_errors();
+}