From 398d3a14cda08e6314dfb3233ef2f51ebbec4ec0 Mon Sep 17 00:00:00 2001
From: Connor Fitzgerald <connorwadefitzgerald@gmail.com>
Date: Thu, 14 Sep 2023 16:38:21 -0400
Subject: [PATCH] Workaround NV bug (#4132)

---
 tests/tests/regression/issue_4122.rs | 110 +++++++++++++++++++++++++++
 tests/tests/root.rs                  |   1 +
 wgpu-hal/src/vulkan/adapter.rs       |   4 +
 wgpu-hal/src/vulkan/command.rs       |  47 +++++++++---
 wgpu-hal/src/vulkan/mod.rs           |  22 ++++++
 5 files changed, 175 insertions(+), 9 deletions(-)
 create mode 100644 tests/tests/regression/issue_4122.rs
diff --git a/tests/tests/regression/issue_4122.rs b/tests/tests/regression/issue_4122.rs
new file mode 100644
index 0000000000..41b9cd4231
--- /dev/null
+++ b/tests/tests/regression/issue_4122.rs
@@ -0,0 +1,110 @@
+use std::{num::NonZeroU64, ops::Range};
+
+use wasm_bindgen_test::wasm_bindgen_test;
+use wgpu_test::{initialize_test, TestParameters, TestingContext};
+
+fn fill_test(ctx: &TestingContext, range: Range<u64>, size: u64) -> bool {
+    let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("gpu_buffer"),
+        size,
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
+        mapped_at_creation: false,
+    });
+
+    let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("cpu_buffer"),
+        size,
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+        mapped_at_creation: false,
+    });
+
+    // Initialize the whole buffer with values.
+    let buffer_contents = vec![0xFF_u8; size as usize];
+    ctx.queue.write_buffer(&gpu_buffer, 0, &buffer_contents);
+
+    let mut encoder = ctx
+        .device
+        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+            label: Some("encoder"),
+        });
+
+    encoder.clear_buffer(
+        &gpu_buffer,
+        range.start,
+        NonZeroU64::new(range.end - range.start),
+    );
+    encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, size);
+
+    ctx.queue.submit(Some(encoder.finish()));
+    cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());
+    ctx.device.poll(wgpu::Maintain::Wait);
+
+    let buffer_slice = cpu_buffer.slice(..);
+    let buffer_data = buffer_slice.get_mapped_range();
+
+    let first_clear_byte = buffer_data
+        .iter()
+        .enumerate()
+        .find_map(|(index, byte)| (*byte == 0x00).then_some(index))
+        .expect("No clear happened at all");
+
+    let first_dirty_byte = buffer_data
+        .iter()
+        .enumerate()
+        .skip(first_clear_byte)
+        .find_map(|(index, byte)| (*byte != 0x00).then_some(index))
+        .unwrap_or(size as usize);
+
+    let second_clear_byte = buffer_data
+        .iter()
+        .enumerate()
+        .skip(first_dirty_byte)
+        .find_map(|(index, byte)| (*byte == 0x00).then_some(index));
+
+    if second_clear_byte.is_some() {
+        eprintln!("Found multiple cleared ranges instead of a single clear range of {}..{} on a buffer of size {}.", range.start, range.end, size);
+        return false;
+    }
+
+    let cleared_range = first_clear_byte as u64..first_dirty_byte as u64;
+
+    if cleared_range != range {
+        eprintln!(
+            "Cleared range is {}..{}, but the clear range is {}..{} on a buffer of size {}.",
+            cleared_range.start, cleared_range.end, range.start, range.end, size
+        );
+        return false;
+    }
+
+    eprintln!(
+        "Cleared range is {}..{} on a buffer of size {}.",
+        cleared_range.start, cleared_range.end, size
+    );
+
+    true
+}
+
+/// Nvidia has a bug in vkCmdFillBuffer where the clear range is not properly respected under
+/// certain conditions. See https://github.com/gfx-rs/wgpu/issues/4122 for more information.
+///
+/// This test will fail on nvidia if the bug is not properly worked around.
+#[wasm_bindgen_test]
+#[test]
+fn clear_buffer_bug() {
+    initialize_test(TestParameters::default(), |ctx| {
+        // This hits most of the cases in nvidia's clear buffer bug
+        let mut succeeded = true;
+        for power in 4..14 {
+            let size = 1 << power;
+            for start_offset in (0..=36).step_by(4) {
+                for size_offset in (0..=36).step_by(4) {
+                    let range = start_offset..size + size_offset + start_offset;
+                    let result = fill_test(&ctx, range, 1 << 16);
+
+                    succeeded &= result;
+                }
+            }
+        }
+        assert!(succeeded);
+    });
+}
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
index a84eab3fbe..0db933682b 100644
--- a/tests/tests/root.rs
+++ b/tests/tests/root.rs
@@ -2,6 +2,7 @@ use wasm_bindgen_test::wasm_bindgen_test_configure;
 
 mod regression {
     mod issue_3457;
+    mod issue_4122;
 }
 
 mod buffer;
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index d23aca76a3..1dd3b266e9 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -982,6 +982,10 @@ impl super::Instance {
                 super::Workarounds::EMPTY_RESOLVE_ATTACHMENT_LISTS,
                 phd_capabilities.properties.vendor_id == db::qualcomm::VENDOR,
             );
+            workarounds.set(
+                super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16,
+                phd_capabilities.properties.vendor_id == db::nvidia::VENDOR,
+            );
         };
 
         if phd_capabilities.effective_api_version == vk::API_VERSION_1_0
diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs
index 417367689b..7d64502c13 100644
--- a/wgpu-hal/src/vulkan/command.rs
+++ b/wgpu-hal/src/vulkan/command.rs
@@ -197,15 +197,44 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
     }
 
     unsafe fn clear_buffer(&mut self, buffer: &super::Buffer, range: crate::MemoryRange) {
-        unsafe {
-            self.device.raw.cmd_fill_buffer(
-                self.active,
-                buffer.raw,
-                range.start,
-                range.end - range.start,
-                0,
-            )
-        };
+        let range_size = range.end - range.start;
+        if self.device.workarounds.contains(
+            super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16,
+        ) && range_size >= 4096
+            && range.start % 16 != 0
+        {
+            let rounded_start = wgt::math::align_to(range.start, 16);
+            let prefix_size = rounded_start - range.start;
+
+            unsafe {
+                self.device.raw.cmd_fill_buffer(
+                    self.active,
+                    buffer.raw,
+                    range.start,
+                    prefix_size,
+                    0,
+                )
+            };
+
+            // This will never be zero, as rounding can only add up to 12 bytes, and the total size is 4096.
+            let suffix_size = range.end - rounded_start;
+
+            unsafe {
+                self.device.raw.cmd_fill_buffer(
+                    self.active,
+                    buffer.raw,
+                    rounded_start,
+                    suffix_size,
+                    0,
+                )
+            };
+        } else {
+            unsafe {
+                self.device
+                    .raw
+                    .cmd_fill_buffer(self.active, buffer.raw, range.start, range_size, 0)
+            };
+        }
     }
 
     unsafe fn copy_buffer_to_buffer<T>(
diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
index 047890f5cf..7efb81b10d 100644
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@@ -205,6 +205,28 @@ bitflags::bitflags!(
         /// Qualcomm OOMs when there are zero color attachments but a non-null pointer
         /// to a subpass resolve attachment array. This nulls out that pointer in that case.
         const EMPTY_RESOLVE_ATTACHMENT_LISTS = 0x2;
+        /// If the following code returns false, then nvidia will end up filling the wrong range.
+        ///
+        /// ```skip
+        /// fn nvidia_succeeds() -> bool {
+        ///   # let (copy_length, start_offset) = (0, 0);
+        ///     if copy_length >= 4096 {
+        ///         if start_offset % 16 != 0 {
+        ///             if copy_length == 4096 {
+        ///                 return true;
+        ///             }
+        ///             if copy_length % 16 == 0 {
+        ///                 return false;
+        ///             }
+        ///         }
+        ///     }
+        ///     true
+        /// }
+        /// ```
+        ///
+        /// As such, we need to make sure all calls to vkCmdFillBuffer are aligned to 16 bytes
+        /// if they cover a range of 4096 bytes or more.
+        const FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16 = 0x4;
     }
 );