diff --git a/crates/bevy_render/src/gpu_component_array_buffer.rs b/crates/bevy_render/src/gpu_component_array_buffer.rs
new file mode 100644
index 0000000000000..6076049c7fd2c
--- /dev/null
+++ b/crates/bevy_render/src/gpu_component_array_buffer.rs
@@ -0,0 +1,55 @@
+use crate::{
+    render_resource::{GpuArrayBuffer, GpuArrayBufferable},
+    renderer::{RenderDevice, RenderQueue},
+    Render, RenderApp, RenderSet,
+};
+use bevy_app::{App, Plugin};
+use bevy_ecs::{
+    prelude::{Component, Entity},
+    schedule::IntoSystemConfigs,
+    system::{Commands, Query, Res, ResMut},
+};
+use std::marker::PhantomData;
+
+/// This plugin prepares the components of the corresponding type for the GPU
+/// by storing them in a [`GpuArrayBuffer`].
+pub struct GpuComponentArrayBufferPlugin<C: Component + GpuArrayBufferable>(PhantomData<C>);
+
+impl<C: Component + GpuArrayBufferable> Plugin for GpuComponentArrayBufferPlugin<C> {
+    fn build(&self, app: &mut App) {
+        if let Ok(render_app) = app.get_sub_app_mut(RenderApp) {
+            render_app
+                .insert_resource(GpuArrayBuffer::<C>::new(
+                    render_app.world.resource::<RenderDevice>(),
+                ))
+                .add_systems(
+                    Render,
+                    prepare_gpu_component_array_buffers::<C>.in_set(RenderSet::Prepare),
+                );
+        }
+    }
+}
+
+impl<C: Component + GpuArrayBufferable> Default for GpuComponentArrayBufferPlugin<C> {
+    fn default() -> Self {
+        Self(PhantomData::<C>)
+    }
+}
+
+fn prepare_gpu_component_array_buffers<C: Component + GpuArrayBufferable>(
+    mut commands: Commands,
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut gpu_array_buffer: ResMut<GpuArrayBuffer<C>>,
+    components: Query<(Entity, &C)>,
+) {
+    gpu_array_buffer.clear();
+
+    let entities = components
+        .iter()
+        .map(|(entity, component)| (entity, gpu_array_buffer.push(component.clone())))
+        .collect::<Vec<_>>();
+    commands.insert_or_spawn_batch(entities);
+
+    gpu_array_buffer.write_buffer(&render_device, &render_queue);
+}
diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs
index cb625bf49d9db..06b77ac9dba18 100644
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@@ -11,6 +11,7 @@ pub mod extract_component;
 mod extract_param;
 pub mod extract_resource;
 pub mod globals;
+pub mod gpu_component_array_buffer;
 pub mod mesh;
 pub mod pipelined_rendering;
 pub mod primitives;
diff --git a/crates/bevy_render/src/render_resource/batched_uniform_buffer.rs b/crates/bevy_render/src/render_resource/batched_uniform_buffer.rs
new file mode 100644
index 0000000000000..a9fba2ac7fb42
--- /dev/null
+++ b/crates/bevy_render/src/render_resource/batched_uniform_buffer.rs
@@ -0,0 +1,152 @@
+use super::{GpuArrayBufferIndex, GpuArrayBufferable};
+use crate::{
+    render_resource::DynamicUniformBuffer,
+    renderer::{RenderDevice, RenderQueue},
+};
+use encase::{
+    private::{ArrayMetadata, BufferMut, Metadata, RuntimeSizedArray, WriteInto, Writer},
+    ShaderType,
+};
+use std::{marker::PhantomData, num::NonZeroU64};
+use wgpu::{BindingResource, Limits};
+
+// 1MB else we will make really large arrays on macOS which reports very large
+// `max_uniform_buffer_binding_size`. On macOS this ends up being the minimum
+// size of the uniform buffer as well as the size of each chunk of data at a
+// dynamic offset.
+#[cfg(any(not(feature = "webgl"), not(target_arch = "wasm32")))]
+const MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE: u32 = 1 << 20;
+
+// WebGL2 quirk: using uniform buffers larger than 4KB will cause extremely
+// long shader compilation times, so the limit needs to be lower on WebGL2.
+// This is due to older shader compilers/GPUs that don't support dynamically
+// indexing uniform buffers, and instead emulate it with large switch statements
+// over buffer indices that take a long time to compile.
+#[cfg(all(feature = "webgl", target_arch = "wasm32"))]
+const MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE: u32 = 1 << 12;
+
+/// Similar to [`DynamicUniformBuffer`], except every N elements (depending on size)
+/// are grouped into a batch as an `array<T, N>` in WGSL.
+///
+/// This reduces the number of rebindings required due to having to pass dynamic
+/// offsets to bind group commands, and if indices into the array can be passed
+/// in via other means, it enables batching of draw commands.
+pub struct BatchedUniformBuffer<T: GpuArrayBufferable> {
+    // Batches of fixed-size arrays of T are written to this buffer so that
+    // each batch in a fixed-size array can be bound at a dynamic offset.
+    uniforms: DynamicUniformBuffer<MaxCapacityArray<Vec<T>>>,
+    // A batch of T are gathered into this `MaxCapacityArray` until it is full,
+    // then it is written into the `DynamicUniformBuffer`, cleared, and new T
+    // are gathered here, and so on for each batch.
+    temp: MaxCapacityArray<Vec<T>>,
+    current_offset: u32,
+    dynamic_offset_alignment: u32,
+}
+
+impl<T: GpuArrayBufferable> BatchedUniformBuffer<T> {
+    pub fn batch_size(limits: &Limits) -> usize {
+        (limits
+            .max_uniform_buffer_binding_size
+            .min(MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE) as u64
+            / T::min_size().get()) as usize
+    }
+
+    pub fn new(limits: &Limits) -> Self {
+        let capacity = Self::batch_size(limits);
+        let alignment = limits.min_uniform_buffer_offset_alignment;
+
+        Self {
+            uniforms: DynamicUniformBuffer::new_with_alignment(alignment as u64),
+            temp: MaxCapacityArray(Vec::with_capacity(capacity), capacity),
+            current_offset: 0,
+            dynamic_offset_alignment: alignment,
+        }
+    }
+
+    #[inline]
+    pub fn size(&self) -> NonZeroU64 {
+        self.temp.size()
+    }
+
+    pub fn clear(&mut self) {
+        self.uniforms.clear();
+        self.current_offset = 0;
+        self.temp.0.clear();
+    }
+
+    pub fn push(&mut self, component: T) -> GpuArrayBufferIndex<T> {
+        let result = GpuArrayBufferIndex {
+            index: self.temp.0.len() as u32,
+            dynamic_offset: Some(self.current_offset),
+            element_type: PhantomData,
+        };
+        self.temp.0.push(component);
+        if self.temp.0.len() == self.temp.1 {
+            self.flush();
+        }
+        result
+    }
+
+    pub fn flush(&mut self) {
+        self.uniforms.push(self.temp.clone());
+
+        self.current_offset +=
+            align_to_next(self.temp.size().get(), self.dynamic_offset_alignment as u64) as u32;
+
+        self.temp.0.clear();
+    }
+
+    pub fn write_buffer(&mut self, device: &RenderDevice, queue: &RenderQueue) {
+        if !self.temp.0.is_empty() {
+            self.flush();
+        }
+        self.uniforms.write_buffer(device, queue);
+    }
+
+    #[inline]
+    pub fn binding(&self) -> Option<BindingResource> {
+        let mut binding = self.uniforms.binding();
+        if let Some(BindingResource::Buffer(binding)) = &mut binding {
+            // MaxCapacityArray is runtime-sized so can't use T::min_size()
+            binding.size = Some(self.size());
+        }
+        binding
+    }
+}
+
+#[inline]
+fn align_to_next(value: u64, alignment: u64) -> u64 {
+    debug_assert!(alignment & (alignment - 1) == 0);
+    ((value - 1) | (alignment - 1)) + 1
+}
+
+// ----------------------------------------------------------------------------
+// MaxCapacityArray was implemented by Teodor Tanasoaia for encase. It was
+// copied here as it was not yet included in an encase release and it is
+// unclear if it is the correct long-term solution for encase.
+
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
+struct MaxCapacityArray<T>(T, usize);
+
+impl<T> ShaderType for MaxCapacityArray<T>
+where
+    T: ShaderType<ExtraMetadata = ArrayMetadata>,
+{
+    type ExtraMetadata = ArrayMetadata;
+
+    const METADATA: Metadata<Self::ExtraMetadata> = T::METADATA;
+
+    fn size(&self) -> ::core::num::NonZeroU64 {
+        Self::METADATA.stride().mul(self.1.max(1) as u64).0
+    }
+}
+
+impl<T> WriteInto for MaxCapacityArray<T>
+where
+    T: WriteInto + RuntimeSizedArray,
+{
+    fn write_into<B: BufferMut>(&self, writer: &mut Writer<B>) {
+        debug_assert!(self.0.len() <= self.1);
+        self.0.write_into(writer);
+    }
+}
diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs
index 07440a27e9387..002e8f8bd3309 100644
--- a/crates/bevy_render/src/render_resource/buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/buffer_vec.rs
@@ -21,9 +21,11 @@ use wgpu::BufferUsages;
 /// from system RAM to VRAM.
 ///
 /// Other options for storing GPU-accessible data are:
+/// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
 /// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
 /// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
 /// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
+/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
 /// * [`BufferVec`](crate::render_resource::BufferVec)
 /// * [`Texture`](crate::render_resource::Texture)
 pub struct BufferVec<T: Pod> {
diff --git a/crates/bevy_render/src/render_resource/gpu_array_buffer.rs b/crates/bevy_render/src/render_resource/gpu_array_buffer.rs
new file mode 100644
index 0000000000000..45eaba4f73246
--- /dev/null
+++ b/crates/bevy_render/src/render_resource/gpu_array_buffer.rs
@@ -0,0 +1,129 @@
+use super::StorageBuffer;
+use crate::{
+    render_resource::batched_uniform_buffer::BatchedUniformBuffer,
+    renderer::{RenderDevice, RenderQueue},
+};
+use bevy_ecs::{prelude::Component, system::Resource};
+use encase::{private::WriteInto, ShaderSize, ShaderType};
+use std::{marker::PhantomData, mem};
+use wgpu::{BindGroupLayoutEntry, BindingResource, BindingType, BufferBindingType, ShaderStages};
+
+/// Trait for types able to go in a [`GpuArrayBuffer`].
+pub trait GpuArrayBufferable: ShaderType + ShaderSize + WriteInto + Clone {}
+impl<T: ShaderType + ShaderSize + WriteInto + Clone> GpuArrayBufferable for T {}
+
+/// Stores an array of elements to be transferred to the GPU and made accessible to shaders as a read-only array.
+///
+/// On platforms that support storage buffers, this is equivalent to [`StorageBuffer<Vec<T>>`].
+/// Otherwise, this falls back to a dynamic offset uniform buffer with the largest
+/// array of T that fits within a uniform buffer binding (within reasonable limits).
+///
+/// Other options for storing GPU-accessible data are:
+/// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
+/// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
+/// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
+/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
+/// * [`BufferVec`](crate::render_resource::BufferVec)
+/// * [`Texture`](crate::render_resource::Texture)
+#[derive(Resource)]
+pub enum GpuArrayBuffer<T: GpuArrayBufferable> {
+    Uniform(BatchedUniformBuffer<T>),
+    Storage((StorageBuffer<Vec<T>>, Vec<T>)),
+}
+
+impl<T: GpuArrayBufferable> GpuArrayBuffer<T> {
+    pub fn new(device: &RenderDevice) -> Self {
+        let limits = device.limits();
+        if limits.max_storage_buffers_per_shader_stage == 0 {
+            GpuArrayBuffer::Uniform(BatchedUniformBuffer::new(&limits))
+        } else {
+            GpuArrayBuffer::Storage((StorageBuffer::default(), Vec::new()))
+        }
+    }
+
+    pub fn clear(&mut self) {
+        match self {
+            GpuArrayBuffer::Uniform(buffer) => buffer.clear(),
+            GpuArrayBuffer::Storage((_, buffer)) => buffer.clear(),
+        }
+    }
+
+    pub fn push(&mut self, value: T) -> GpuArrayBufferIndex<T> {
+        match self {
+            GpuArrayBuffer::Uniform(buffer) => buffer.push(value),
+            GpuArrayBuffer::Storage((_, buffer)) => {
+                let index = buffer.len() as u32;
+                buffer.push(value);
+                GpuArrayBufferIndex {
+                    index,
+                    dynamic_offset: None,
+                    element_type: PhantomData,
+                }
+            }
+        }
+    }
+
+    pub fn write_buffer(&mut self, device: &RenderDevice, queue: &RenderQueue) {
+        match self {
+            GpuArrayBuffer::Uniform(buffer) => buffer.write_buffer(device, queue),
+            GpuArrayBuffer::Storage((buffer, vec)) => {
+                buffer.set(mem::take(vec));
+                buffer.write_buffer(device, queue);
+            }
+        }
+    }
+
+    pub fn binding_layout(
+        binding: u32,
+        visibility: ShaderStages,
+        device: &RenderDevice,
+    ) -> BindGroupLayoutEntry {
+        BindGroupLayoutEntry {
+            binding,
+            visibility,
+            ty: if device.limits().max_storage_buffers_per_shader_stage == 0 {
+                BindingType::Buffer {
+                    ty: BufferBindingType::Uniform,
+                    has_dynamic_offset: true,
+                    // BatchedUniformBuffer uses a MaxCapacityArray that is runtime-sized, so we use
+                    // None here and let wgpu figure out the size.
+                    min_binding_size: None,
+                }
+            } else {
+                BindingType::Buffer {
+                    ty: BufferBindingType::Storage { read_only: true },
+                    has_dynamic_offset: false,
+                    min_binding_size: Some(T::min_size()),
+                }
+            },
+            count: None,
+        }
+    }
+
+    pub fn binding(&self) -> Option<BindingResource> {
+        match self {
+            GpuArrayBuffer::Uniform(buffer) => buffer.binding(),
+            GpuArrayBuffer::Storage((buffer, _)) => buffer.binding(),
+        }
+    }
+
+    pub fn batch_size(device: &RenderDevice) -> Option<u32> {
+        let limits = device.limits();
+        if limits.max_storage_buffers_per_shader_stage == 0 {
+            Some(BatchedUniformBuffer::<T>::batch_size(&limits) as u32)
+        } else {
+            None
+        }
+    }
+}
+
+/// An index into a [`GpuArrayBuffer`] for a given element.
+#[derive(Component)]
+pub struct GpuArrayBufferIndex<T: GpuArrayBufferable> {
+    /// The index to use in a shader into the array.
+    pub index: u32,
+    /// The dynamic offset to use when setting the bind group in a pass.
+    /// Only used on platforms that don't support storage buffers.
+    pub dynamic_offset: Option<u32>,
+    pub element_type: PhantomData<T>,
+}
diff --git a/crates/bevy_render/src/render_resource/mod.rs b/crates/bevy_render/src/render_resource/mod.rs
index 91440cf55c276..f16f5f1269929 100644
--- a/crates/bevy_render/src/render_resource/mod.rs
+++ b/crates/bevy_render/src/render_resource/mod.rs
@@ -1,7 +1,9 @@
+mod batched_uniform_buffer;
 mod bind_group;
 mod bind_group_layout;
 mod buffer;
 mod buffer_vec;
+mod gpu_array_buffer;
 mod pipeline;
 mod pipeline_cache;
 mod pipeline_specializer;
@@ -15,6 +17,7 @@ pub use bind_group::*;
 pub use bind_group_layout::*;
 pub use buffer::*;
 pub use buffer_vec::*;
+pub use gpu_array_buffer::*;
 pub use pipeline::*;
 pub use pipeline_cache::*;
 pub use pipeline_specializer::*;
diff --git a/crates/bevy_render/src/render_resource/storage_buffer.rs b/crates/bevy_render/src/render_resource/storage_buffer.rs
index 26d3797eded5c..2c73b322d79b5 100644
--- a/crates/bevy_render/src/render_resource/storage_buffer.rs
+++ b/crates/bevy_render/src/render_resource/storage_buffer.rs
@@ -25,6 +25,7 @@ use wgpu::{util::BufferInitDescriptor, BindingResource, BufferBinding, BufferUsa
 /// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
 /// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
 /// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
+/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
 /// * [`BufferVec`](crate::render_resource::BufferVec)
 /// * [`Texture`](crate::render_resource::Texture)
 ///
@@ -154,6 +155,7 @@ impl<T: ShaderType + WriteInto> StorageBuffer<T> {
 /// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
 /// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
 /// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
+/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
 /// * [`BufferVec`](crate::render_resource::BufferVec)
 /// * [`Texture`](crate::render_resource::Texture)
 ///
diff --git a/crates/bevy_render/src/render_resource/uniform_buffer.rs b/crates/bevy_render/src/render_resource/uniform_buffer.rs
index 137432c062bfb..4c1ad61b2aeb8 100644
--- a/crates/bevy_render/src/render_resource/uniform_buffer.rs
+++ b/crates/bevy_render/src/render_resource/uniform_buffer.rs
@@ -22,9 +22,10 @@ use wgpu::{util::BufferInitDescriptor, BindingResource, BufferBinding, BufferUsa
 /// (vectors), or structures with fields that are vectors.
 ///
 /// Other options for storing GPU-accessible data are:
-/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
 /// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
 /// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
+/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
+/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
 /// * [`BufferVec`](crate::render_resource::BufferVec)
 /// * [`Texture`](crate::render_resource::Texture)
 ///
@@ -151,6 +152,8 @@ impl<T: ShaderType + WriteInto> UniformBuffer<T> {
 /// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
 /// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
 /// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
+/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
+/// * [`BufferVec`](crate::render_resource::BufferVec)
 /// * [`Texture`](crate::render_resource::Texture)
 ///
 /// [std140 alignment/padding requirements]: https://www.w3.org/TR/WGSL/#address-spaces-uniform
@@ -177,6 +180,17 @@ impl<T: ShaderType + WriteInto> Default for DynamicUniformBuffer<T> {
 }
 
 impl<T: ShaderType + WriteInto> DynamicUniformBuffer<T> {
+    pub fn new_with_alignment(alignment: u64) -> Self {
+        Self {
+            scratch: DynamicUniformBufferWrapper::new_with_alignment(Vec::new(), alignment),
+            buffer: None,
+            label: None,
+            changed: false,
+            buffer_usage: BufferUsages::COPY_DST | BufferUsages::UNIFORM,
+            _marker: PhantomData,
+        }
+    }
+
     #[inline]
     pub fn buffer(&self) -> Option<&Buffer> {
         self.buffer.as_ref()
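
Usage note: below is a minimal sketch (not part of the diff above) of how a rendering feature might adopt this API. `MyGpuData`, `setup`, and `layout_entry` are hypothetical names introduced for illustration. Registering `GpuComponentArrayBufferPlugin::<MyGpuData>` collects every `MyGpuData` component into one `GpuArrayBuffer<MyGpuData>` each frame and tags the corresponding render-world entities with `GpuArrayBufferIndex<MyGpuData>`, while `GpuArrayBuffer::binding_layout` selects the storage-buffer or uniform-fallback binding type from the device limits.

```rust
// Hypothetical usage sketch; names here are illustrative and not added by this change.
use bevy_app::App;
use bevy_ecs::prelude::Component;
use bevy_render::{
    gpu_component_array_buffer::GpuComponentArrayBufferPlugin,
    render_resource::{BindGroupLayoutEntry, GpuArrayBuffer, ShaderStages, ShaderType},
    renderer::RenderDevice,
};

// Satisfies GpuArrayBufferable (ShaderType + ShaderSize + WriteInto + Clone).
#[derive(Component, Clone, ShaderType)]
struct MyGpuData {
    color: [f32; 4],
}

fn setup(app: &mut App) {
    // Every frame, all `MyGpuData` components are gathered into a single
    // GpuArrayBuffer<MyGpuData> resource in the render app, and each entity
    // gets a GpuArrayBufferIndex<MyGpuData> component pointing at its slot.
    app.add_plugins(GpuComponentArrayBufferPlugin::<MyGpuData>::default());
}

fn layout_entry(render_device: &RenderDevice) -> BindGroupLayoutEntry {
    // Yields a read-only storage buffer binding where storage buffers are
    // available, and a dynamically offset uniform binding otherwise.
    GpuArrayBuffer::<MyGpuData>::binding_layout(0, ShaderStages::VERTEX_FRAGMENT, render_device)
}
```

On the uniform fallback, the shader-side declaration would be a fixed-size array whose length matches `GpuArrayBuffer::<MyGpuData>::batch_size`, and the `dynamic_offset` from `GpuArrayBufferIndex` must be supplied when setting the bind group in a pass; on storage-buffer platforms only `index` is needed.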