Workaround NV bug (#4132)

gfx-rs · Sep 27, 2023 · 398d3a1 · 398d3a1
1 parent 1d0a22b
commit 398d3a1
Show file tree

Hide file tree

Showing 5 changed files with 175 additions and 9 deletions.
diff --git a/tests/tests/regression/issue_4122.rs b/tests/tests/regression/issue_4122.rs
@@ -0,0 +1,110 @@
+use std::{num::NonZeroU64, ops::Range};
+
+use wasm_bindgen_test::wasm_bindgen_test;
+use wgpu_test::{initialize_test, TestParameters, TestingContext};
+
+fn fill_test(ctx: &TestingContext, range: Range<u64>, size: u64) -> bool {
+ let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("gpu_buffer"),
+ size,
+ usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
+ mapped_at_creation: false,
+ });
+
+ let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+ label: Some("cpu_buffer"),
+ size,
+ usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+ mapped_at_creation: false,
+ });
+
+ // Initialize the whole buffer with values.
+ let buffer_contents = vec![0xFF_u8; size as usize];
+ ctx.queue.write_buffer(&gpu_buffer, 0, &buffer_contents);
+
+ let mut encoder = ctx
+ .device
+ .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+ label: Some("encoder"),
+ });
+
+ encoder.clear_buffer(
+ &gpu_buffer,
+ range.start,
+ NonZeroU64::new(range.end - range.start),
+ );
+ encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, size);
+
+ ctx.queue.submit(Some(encoder.finish()));
+ cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ());
+ ctx.device.poll(wgpu::Maintain::Wait);
+
+ let buffer_slice = cpu_buffer.slice(..);
+ let buffer_data = buffer_slice.get_mapped_range();
+
+ let first_clear_byte = buffer_data
+ .iter()
+ .enumerate()
+ .find_map(|(index, byte)| (*byte == 0x00).then_some(index))
+ .expect("No clear happened at all");
+
+ let first_dirty_byte = buffer_data
+ .iter()
+ .enumerate()
+ .skip(first_clear_byte)
+ .find_map(|(index, byte)| (*byte != 0x00).then_some(index))
+ .unwrap_or(size as usize);
+
+ let second_clear_byte = buffer_data
+ .iter()
+ .enumerate()
+ .skip(first_dirty_byte)
+ .find_map(|(index, byte)| (*byte == 0x00).then_some(index));
+
+ if second_clear_byte.is_some() {
+ eprintln!("Found multiple cleared ranges instead of a single clear range of {}..{} on a buffer of size {}.", range.start, range.end, size);
+ return false;
+ }
+
+ let cleared_range = first_clear_byte as u64..first_dirty_byte as u64;
+
+ if cleared_range != range {
+ eprintln!(
+ "Cleared range is {}..{}, but the clear range is {}..{} on a buffer of size {}.",
+ cleared_range.start, cleared_range.end, range.start, range.end, size
+ );
+ return false;
+ }
+
+ eprintln!(
+ "Cleared range is {}..{} on a buffer of size {}.",
+ cleared_range.start, cleared_range.end, size
+ );
+
+ true
+}
+
+/// Nvidia has a bug in vkCmdFillBuffer where the clear range is not properly respected under
+/// certain conditions. See https://github.com/gfx-rs/wgpu/issues/4122 for more information.
+///
+/// This test will fail on nvidia if the bug is not properly worked around.
+#[wasm_bindgen_test]
+#[test]
+fn clear_buffer_bug() {
+ initialize_test(TestParameters::default(), |ctx| {
+ // This hits most of the cases in nvidia's clear buffer bug
+ let mut succeeded = true;
+ for power in 4..14 {
+ let size = 1 << power;
+ for start_offset in (0..=36).step_by(4) {
+ for size_offset in (0..=36).step_by(4) {
+ let range = start_offset..size + size_offset + start_offset;
+ let result = fill_test(&ctx, range, 1 << 16);
+
+ succeeded &= result;
+ }
+ }
+ }
+ assert!(succeeded);
+ });
+}
diff --git a/tests/tests/root.rs b/tests/tests/root.rs
@@ -2,6 +2,7 @@ use wasm_bindgen_test::wasm_bindgen_test_configure;
 
 mod regression {
  mod issue_3457;
+ mod issue_4122;
 }
 
 mod buffer;

diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
@@ -982,6 +982,10 @@ impl super::Instance {
  super::Workarounds::EMPTY_RESOLVE_ATTACHMENT_LISTS,
  phd_capabilities.properties.vendor_id == db::qualcomm::VENDOR,
  );
+ workarounds.set(
+ super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16,
+ phd_capabilities.properties.vendor_id == db::nvidia::VENDOR,
+ );
  };
 
  if phd_capabilities.effective_api_version == vk::API_VERSION_1_0

diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs
@@ -197,15 +197,44 @@ impl crate::CommandEncoder<super::Api> for super::CommandEncoder {
  }
 
  unsafe fn clear_buffer(&mut self, buffer: &super::Buffer, range: crate::MemoryRange) {
- unsafe {
- self.device.raw.cmd_fill_buffer(
- self.active,
- buffer.raw,
- range.start,
- range.end - range.start,
- 0,
- )
- };
+ let range_size = range.end - range.start;
+ if self.device.workarounds.contains(
+ super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16,
+ ) && range_size >= 4096
+ && range.start % 16 != 0
+ {
+ let rounded_start = wgt::math::align_to(range.start, 16);
+ let prefix_size = rounded_start - range.start;
+
+ unsafe {
+ self.device.raw.cmd_fill_buffer(
+ self.active,
+ buffer.raw,
+ range.start,
+ prefix_size,
+ 0,
+ )
+ };
+
+ // This will never be zero, as rounding can only add up to 12 bytes, and the total size is 4096.
+ let suffix_size = range.end - rounded_start;
+
+ unsafe {
+ self.device.raw.cmd_fill_buffer(
+ self.active,
+ buffer.raw,
+ rounded_start,
+ suffix_size,
+ 0,
+ )
+ };
+ } else {
+ unsafe {
+ self.device
+ .raw
+ .cmd_fill_buffer(self.active, buffer.raw, range.start, range_size, 0)
+ };
+ }
  }
 
  unsafe fn copy_buffer_to_buffer<T>(

diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
@@ -205,6 +205,28 @@ bitflags::bitflags!(
  /// Qualcomm OOMs when there are zero color attachments but a non-null pointer
  /// to a subpass resolve attachment array. This nulls out that pointer in that case.
  const EMPTY_RESOLVE_ATTACHMENT_LISTS = 0x2;
+ /// If the following code returns false, then nvidia will end up filling the wrong range.
+ ///
+ /// ```skip
+ /// fn nvidia_succeeds() -> bool {
+ /// # let (copy_length, start_offset) = (0, 0);
+ /// if copy_length >= 4096 {
+ /// if start_offset % 16 != 0 {
+ /// if copy_length == 4096 {
+ /// return true;
+ /// }
+ /// if copy_length % 16 == 0 {
+ /// return false;
+ /// }
+ /// }
+ /// }
+ /// true
+ /// }
+ /// ```
+ ///
+ /// As such, we need to make sure all calls to vkCmdFillBuffer are aligned to 16 bytes
+ /// if they cover a range of 4096 bytes or more.
+ const FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16 = 0x4;
  }
 );