From 398d3a14cda08e6314dfb3233ef2f51ebbec4ec0 Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Thu, 14 Sep 2023 16:38:21 -0400 Subject: [PATCH] Workaround NV bug (#4132) --- tests/tests/regression/issue_4122.rs | 110 +++++++++++++++++++++++++++ tests/tests/root.rs | 1 + wgpu-hal/src/vulkan/adapter.rs | 4 + wgpu-hal/src/vulkan/command.rs | 47 +++++++++--- wgpu-hal/src/vulkan/mod.rs | 22 ++++++ 5 files changed, 175 insertions(+), 9 deletions(-) create mode 100644 tests/tests/regression/issue_4122.rs diff --git a/tests/tests/regression/issue_4122.rs b/tests/tests/regression/issue_4122.rs new file mode 100644 index 0000000000..41b9cd4231 --- /dev/null +++ b/tests/tests/regression/issue_4122.rs @@ -0,0 +1,110 @@ +use std::{num::NonZeroU64, ops::Range}; + +use wasm_bindgen_test::wasm_bindgen_test; +use wgpu_test::{initialize_test, TestParameters, TestingContext}; + +fn fill_test(ctx: &TestingContext, range: Range, size: u64) -> bool { + let gpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("gpu_buffer"), + size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }); + + let cpu_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("cpu_buffer"), + size, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + + // Initialize the whole buffer with values. + let buffer_contents = vec![0xFF_u8; size as usize]; + ctx.queue.write_buffer(&gpu_buffer, 0, &buffer_contents); + + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("encoder"), + }); + + encoder.clear_buffer( + &gpu_buffer, + range.start, + NonZeroU64::new(range.end - range.start), + ); + encoder.copy_buffer_to_buffer(&gpu_buffer, 0, &cpu_buffer, 0, size); + + ctx.queue.submit(Some(encoder.finish())); + cpu_buffer.slice(..).map_async(wgpu::MapMode::Read, |_| ()); + ctx.device.poll(wgpu::Maintain::Wait); + + let buffer_slice = cpu_buffer.slice(..); + let buffer_data = buffer_slice.get_mapped_range(); + + let first_clear_byte = buffer_data + .iter() + .enumerate() + .find_map(|(index, byte)| (*byte == 0x00).then_some(index)) + .expect("No clear happened at all"); + + let first_dirty_byte = buffer_data + .iter() + .enumerate() + .skip(first_clear_byte) + .find_map(|(index, byte)| (*byte != 0x00).then_some(index)) + .unwrap_or(size as usize); + + let second_clear_byte = buffer_data + .iter() + .enumerate() + .skip(first_dirty_byte) + .find_map(|(index, byte)| (*byte == 0x00).then_some(index)); + + if second_clear_byte.is_some() { + eprintln!("Found multiple cleared ranges instead of a single clear range of {}..{} on a buffer of size {}.", range.start, range.end, size); + return false; + } + + let cleared_range = first_clear_byte as u64..first_dirty_byte as u64; + + if cleared_range != range { + eprintln!( + "Cleared range is {}..{}, but the clear range is {}..{} on a buffer of size {}.", + cleared_range.start, cleared_range.end, range.start, range.end, size + ); + return false; + } + + eprintln!( + "Cleared range is {}..{} on a buffer of size {}.", + cleared_range.start, cleared_range.end, size + ); + + true +} + +/// Nvidia has a bug in vkCmdFillBuffer where the clear range is not properly respected under +/// certain conditions. See https://github.com/gfx-rs/wgpu/issues/4122 for more information. +/// +/// This test will fail on nvidia if the bug is not properly worked around. +#[wasm_bindgen_test] +#[test] +fn clear_buffer_bug() { + initialize_test(TestParameters::default(), |ctx| { + // This hits most of the cases in nvidia's clear buffer bug + let mut succeeded = true; + for power in 4..14 { + let size = 1 << power; + for start_offset in (0..=36).step_by(4) { + for size_offset in (0..=36).step_by(4) { + let range = start_offset..size + size_offset + start_offset; + let result = fill_test(&ctx, range, 1 << 16); + + succeeded &= result; + } + } + } + assert!(succeeded); + }); +} diff --git a/tests/tests/root.rs b/tests/tests/root.rs index a84eab3fbe..0db933682b 100644 --- a/tests/tests/root.rs +++ b/tests/tests/root.rs @@ -2,6 +2,7 @@ use wasm_bindgen_test::wasm_bindgen_test_configure; mod regression { mod issue_3457; + mod issue_4122; } mod buffer; diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index d23aca76a3..1dd3b266e9 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -982,6 +982,10 @@ impl super::Instance { super::Workarounds::EMPTY_RESOLVE_ATTACHMENT_LISTS, phd_capabilities.properties.vendor_id == db::qualcomm::VENDOR, ); + workarounds.set( + super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16, + phd_capabilities.properties.vendor_id == db::nvidia::VENDOR, + ); }; if phd_capabilities.effective_api_version == vk::API_VERSION_1_0 diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs index 417367689b..7d64502c13 100644 --- a/wgpu-hal/src/vulkan/command.rs +++ b/wgpu-hal/src/vulkan/command.rs @@ -197,15 +197,44 @@ impl crate::CommandEncoder for super::CommandEncoder { } unsafe fn clear_buffer(&mut self, buffer: &super::Buffer, range: crate::MemoryRange) { - unsafe { - self.device.raw.cmd_fill_buffer( - self.active, - buffer.raw, - range.start, - range.end - range.start, - 0, - ) - }; + let range_size = range.end - range.start; + if self.device.workarounds.contains( + super::Workarounds::FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16, + ) && range_size >= 4096 + && range.start % 16 != 0 + { + let rounded_start = wgt::math::align_to(range.start, 16); + let prefix_size = rounded_start - range.start; + + unsafe { + self.device.raw.cmd_fill_buffer( + self.active, + buffer.raw, + range.start, + prefix_size, + 0, + ) + }; + + // This will never be zero, as rounding can only add up to 12 bytes, and the total size is 4096. + let suffix_size = range.end - rounded_start; + + unsafe { + self.device.raw.cmd_fill_buffer( + self.active, + buffer.raw, + rounded_start, + suffix_size, + 0, + ) + }; + } else { + unsafe { + self.device + .raw + .cmd_fill_buffer(self.active, buffer.raw, range.start, range_size, 0) + }; + } } unsafe fn copy_buffer_to_buffer( diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index 047890f5cf..7efb81b10d 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ b/wgpu-hal/src/vulkan/mod.rs @@ -205,6 +205,28 @@ bitflags::bitflags!( /// Qualcomm OOMs when there are zero color attachments but a non-null pointer /// to a subpass resolve attachment array. This nulls out that pointer in that case. const EMPTY_RESOLVE_ATTACHMENT_LISTS = 0x2; + /// If the following code returns false, then nvidia will end up filling the wrong range. + /// + /// ```skip + /// fn nvidia_succeeds() -> bool { + /// # let (copy_length, start_offset) = (0, 0); + /// if copy_length >= 4096 { + /// if start_offset % 16 != 0 { + /// if copy_length == 4096 { + /// return true; + /// } + /// if copy_length % 16 == 0 { + /// return false; + /// } + /// } + /// } + /// true + /// } + /// ``` + /// + /// As such, we need to make sure all calls to vkCmdFillBuffer are aligned to 16 bytes + /// if they cover a range of 4096 bytes or more. + const FORCE_FILL_BUFFER_WITH_SIZE_GREATER_4096_ALIGNED_OFFSET_16 = 0x4; } );