diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5a9d07ad27..3d02995134 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -74,6 +74,7 @@ By @Valaphee in [#3402](https://github.com/gfx-rs/wgpu/pull/3402)
 #### General
 - Derive storage bindings via `naga::StorageAccess` instead of `naga::GlobalUse`. By @teoxoy in [#3985](https://github.com/gfx-rs/wgpu/pull/3985).
+- `Queue::on_submitted_work_done` callbacks will now always be called after all previous `BufferSlice::map_async` callbacks, even when there are no active submissions. By @cwfitzgerald in [#4036](https://github.com/gfx-rs/wgpu/pull/4036).
 
 #### Vulkan
 
 - Fix enabling `wgpu::Features::PARTIALLY_BOUND_BINDING_ARRAY` not being actually enabled in vulkan backend. By @39ali in [#3772](https://github.com/gfx-rs/wgpu/pull/3772).
diff --git a/Cargo.lock b/Cargo.lock
index 9117ea2316..aa4e692c89 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3459,6 +3459,7 @@ dependencies = [
  "log",
  "naga",
  "nv-flip",
+ "parking_lot",
  "png",
  "pollster",
  "raw-window-handle 0.5.2",
diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index 85dae4959c..5738ed1bdb 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -24,6 +24,7 @@ bytemuck.workspace = true
 cfg-if.workspace = true
 env_logger.workspace = true
 log.workspace = true
+parking_lot.workspace = true
 png.workspace = true
 pollster.workspace = true
 wgpu.workspace = true
diff --git a/tests/tests/regression/issue_4024.rs b/tests/tests/regression/issue_4024.rs
new file mode 100644
index 0000000000..959f58ffa5
--- /dev/null
+++ b/tests/tests/regression/issue_4024.rs
@@ -0,0 +1,91 @@
+use std::sync::Arc;
+
+use parking_lot::Mutex;
+use wgpu_test::{initialize_test, TestParameters};
+
+use wasm_bindgen_test::wasm_bindgen_test;
+use wgpu::*;
+
+/// The WebGPU specification has very specific requirements about the ordering of map_async
+/// and on_submitted_work_done callbacks. Specifically, all map_async callbacks that are initiated
+/// before a given on_submitted_work_done callback must be invoked before the on_submitted_work_done
+/// callback is invoked.
+///
+/// We previously immediately invoked on_submitted_work_done callbacks if there was no active submission
+/// to add them to. This is incorrect, as we do not immediately invoke map_async callbacks.
+#[wasm_bindgen_test]
+#[test]
+fn queue_submitted_callback_ordering() {
+    initialize_test(TestParameters::default(), |ctx| {
+        // Create a mappable buffer.
+        let buffer = ctx.device.create_buffer(&BufferDescriptor {
+            label: Some("mappable buffer"),
+            size: 4,
+            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        // Encode some work using it. The specifics of this work don't matter, just
+        // that the buffer is used.
+        let mut encoder = ctx
+            .device
+            .create_command_encoder(&CommandEncoderDescriptor {
+                label: Some("encoder"),
+            });
+
+        encoder.clear_buffer(&buffer, 0, None);
+
+        // Submit the work.
+        ctx.queue.submit(Some(encoder.finish()));
+        // Ensure the work is finished.
+        ctx.device.poll(MaintainBase::Wait);
+
+        #[derive(Debug)]
+        struct OrderingContext {
+            /// Incremented every time a callback is invoked.
+            /// This allows the callbacks to know their ordering.
+            counter: u8,
+            /// The value of the counter when the map_async callback was invoked.
+            value_read_map_async: Option<u8>,
+            /// The value of the counter when the queue submitted work done callback was invoked.
+            value_read_queue_submitted: Option<u8>,
+        }
+
+        // Create shared ownership of the ordering context, and clone 2 copies.
+        let ordering = Arc::new(Mutex::new(OrderingContext {
+            counter: 0,
+            value_read_map_async: None,
+            value_read_queue_submitted: None,
+        }));
+        let ordering_clone_map_async = Arc::clone(&ordering);
+        let ordering_clone_queue_submitted = Arc::clone(&ordering);
+
+        // Register the callbacks.
+        buffer.slice(..).map_async(MapMode::Read, move |_| {
+            let mut guard = ordering_clone_map_async.lock();
+            guard.value_read_map_async = Some(guard.counter);
+            guard.counter += 1;
+        });
+
+        // If the bug is present, this callback will be invoked immediately inside this function,
+        // despite the fact that there is an outstanding map_async callback.
+        ctx.queue.on_submitted_work_done(move || {
+            let mut guard = ordering_clone_queue_submitted.lock();
+            guard.value_read_queue_submitted = Some(guard.counter);
+            guard.counter += 1;
+        });
+
+        // No GPU work is happening at this point, but we want to process callbacks.
+        ctx.device.poll(MaintainBase::Poll);
+
+        // Extract the ordering out of the arc.
+        let ordering = Arc::try_unwrap(ordering).unwrap().into_inner();
+
+        // There were two callbacks invoked.
+        assert_eq!(ordering.counter, 2);
+        // The map_async callback was invoked first.
+        assert_eq!(ordering.value_read_map_async, Some(0));
+        // The queue submitted work done callback was invoked second.
+        assert_eq!(ordering.value_read_queue_submitted, Some(1));
+    })
+}
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index a75367f827..86139c9bd2 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -2,6 +2,7 @@ use wasm_bindgen_test::wasm_bindgen_test_configure;
 
 mod regression {
     mod issue_3457;
+    mod issue_4024;
 }
 
 mod buffer;
diff --git a/wgpu-core/src/device/life.rs b/wgpu-core/src/device/life.rs
index 75491d10cb..a12d864104 100644
--- a/wgpu-core/src/device/life.rs
+++ b/wgpu-core/src/device/life.rs
@@ -219,6 +219,9 @@ struct ActiveSubmission<A: HalApi> {
     mapped: Vec<id::Valid<id::BufferId>>,
 
     encoders: Vec<EncoderInFlight<A>>,
+
+    /// List of queue "on_submitted_work_done" closures to be called once this
+    /// submission has completed.
     work_done_closures: SmallVec<[SubmittedWorkDoneClosure; 1]>,
 }
 
@@ -304,6 +307,12 @@ pub(super) struct LifetimeTracker<A: HalApi> {
     /// Buffers the user has asked us to map, and which are not used by any
     /// queue submission still in flight.
    ready_to_map: Vec<id::Valid<id::BufferId>>,
+
+    /// Queue "on_submitted_work_done" closures that were initiated while there
+    /// were no pending submissions. These cannot be invoked immediately, as they
+    /// must fire _after_ all previously registered map_async callbacks, so we
+    /// defer them here until the next time the device is maintained.
+    work_done_closures: SmallVec<[SubmittedWorkDoneClosure; 1]>,
 }
 
 impl<A: HalApi> LifetimeTracker<A> {
@@ -316,6 +325,7 @@ impl<A: HalApi> LifetimeTracker<A> {
             active: Vec::new(),
             free_resources: NonReferencedResources::new(),
             ready_to_map: Vec::new(),
+            work_done_closures: SmallVec::new(),
         }
     }
 
@@ -405,7 +415,7 @@
             .position(|a| a.index > last_done)
             .unwrap_or(self.active.len());
 
-        let mut work_done_closures = SmallVec::new();
+        let mut work_done_closures: SmallVec<_> = self.work_done_closures.drain(..).collect();
         for a in self.active.drain(..done_count) {
             log::trace!("Active submission {} is done", a.index);
             self.free_resources.extend(a.last_resources);
@@ -445,18 +455,16 @@
         }
     }
 
-    pub fn add_work_done_closure(
-        &mut self,
-        closure: SubmittedWorkDoneClosure,
-    ) -> Option<SubmittedWorkDoneClosure> {
+    pub fn add_work_done_closure(&mut self, closure: SubmittedWorkDoneClosure) {
         match self.active.last_mut() {
             Some(active) => {
                 active.work_done_closures.push(closure);
-                None
             }
-            // Note: we can't immediately invoke the closure, since it assumes
-            // nothing is currently locked in the hubs.
-            None => Some(closure),
+            // We must defer the closure until all previously occurring map_async
+            // closures have fired. This is required by the spec.
+            None => {
+                self.work_done_closures.push(closure);
+            }
         }
     }
 }
diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs
index 5792791abe..0ae6d7a2dd 100644
--- a/wgpu-core/src/device/mod.rs
+++ b/wgpu-core/src/device/mod.rs
@@ -181,6 +181,9 @@ impl UserClosures {
     fn fire(self) {
         // Note: this logic is specifically moved out of `handle_mapping()` in order to
         // have nothing locked by the time we execute the user's callback code.
+
+        // Mappings _must_ be fired before submissions, as the spec requires all mapping callbacks
+        // that are registered before an on_submitted_work_done callback to be fired before it.
         for (operation, status) in self.mappings {
             operation.callback.call(status);
         }
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index 6e0be3b297..73fa5de3b0 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -1435,17 +1435,12 @@ impl<G: GlobalIdentityHandlerFactory> Global<G> {
         closure: SubmittedWorkDoneClosure,
     ) -> Result<(), InvalidQueue> {
         //TODO: flush pending writes
-        let closure_opt = {
-            let hub = A::hub(self);
-            let mut token = Token::root();
-            let (device_guard, mut token) = hub.devices.read(&mut token);
-            match device_guard.get(queue_id) {
-                Ok(device) => device.lock_life(&mut token).add_work_done_closure(closure),
-                Err(_) => return Err(InvalidQueue),
-            }
-        };
-        if let Some(closure) = closure_opt {
-            closure.call();
+        let hub = A::hub(self);
+        let mut token = Token::root();
+        let (device_guard, mut token) = hub.devices.read(&mut token);
+        match device_guard.get(queue_id) {
+            Ok(device) => device.lock_life(&mut token).add_work_done_closure(closure),
+            Err(_) => return Err(InvalidQueue),
         }
         Ok(())
     }
diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs
index 04f535dd22..0138051a10 100644
--- a/wgpu/src/lib.rs
+++ b/wgpu/src/lib.rs
@@ -4580,8 +4580,8 @@ impl Queue {
     }
 
     /// Registers a callback when the previous call to submit finishes running on the gpu. This callback
-    /// being called implies that all mapped buffer callbacks attached to the same submission have also
-    /// been called.
+    /// being called implies that all mapped buffer callbacks that were registered before this call will
+    /// have been called.
     ///
     /// For the callback to complete, either `queue.submit(..)`, `instance.poll_all(..)`, or `device.poll(..)`
     /// must be called elsewhere in the runtime, possibly integrated into an event loop or run on a separate thread.
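
Below is a minimal user-side sketch (not part of the patch) of what the new ordering guarantee enables: `on_submitted_work_done` can act as a barrier behind which every previously registered `map_async` callback has already fired. The helper name `read_after_barrier` and the busy-poll loop are illustrative assumptions, not wgpu API; the wgpu calls themselves follow the same 0.17-era API used in the test above, and the loop is a native-only simplification (on the web you would yield to the browser's event loop instead).

```rust
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

/// Hypothetical helper: block until the buffer's pending `map_async` has
/// completed, using the work-done callback as the barrier, then copy the
/// mapped bytes out. `buffer` must have been created with
/// `BufferUsages::MAP_READ`.
fn read_after_barrier(device: &wgpu::Device, queue: &wgpu::Queue, buffer: &wgpu::Buffer) -> Vec<u8> {
    let mapped = Arc::new(AtomicBool::new(false));
    let mapped_in_cb = Arc::clone(&mapped);

    // Registered first: asks for the buffer to be mapped.
    buffer.slice(..).map_async(wgpu::MapMode::Read, move |result| {
        result.expect("buffer mapping failed");
        mapped_in_cb.store(true, Ordering::Release);
    });

    let done = Arc::new(AtomicBool::new(false));
    let done_in_cb = Arc::clone(&done);

    // Registered second: with this patch, this callback is guaranteed to fire
    // after the map_async callback above, even with no submission in flight.
    queue.on_submitted_work_done(move || {
        done_in_cb.store(true, Ordering::Release);
    });

    // Drive callbacks; on native, poll() is what invokes them.
    while !done.load(Ordering::Acquire) {
        device.poll(wgpu::Maintain::Poll);
    }

    // The barrier has fired, so the map_async callback must have fired too.
    assert!(mapped.load(Ordering::Acquire));

    let bytes = buffer.slice(..).get_mapped_range().to_vec();
    buffer.unmap();
    bytes
}
```

Before this change, when no submissions were in flight, `on_submitted_work_done` ran its closure immediately inside the call, so `done` could be observed as true while `mapped` was still false and the assertion above could fail.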