From c1f9764e4c91bab1c1e886f8516753cc415fd59e Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Sat, 8 Feb 2025 19:31:39 -0800
Subject: [PATCH 1/4] Build batches across phases in parallel.

Currently, invocations of `batch_and_prepare_binned_render_phase` and
`batch_and_prepare_sorted_render_phase` can't run in parallel because
they write to scene-global GPU buffers. After PR #17698,
`batch_and_prepare_binned_render_phase` started accounting for the
lion's share of the CPU time, causing us to be strongly CPU bound on
scenes like Caldera when occlusion culling was on (because of the
overhead of batching for the Z-prepass). Although I eventually plan to
optimize `batch_and_prepare_binned_render_phase`, we can obtain
significant wins now by parallelizing that system across phases.

This commit splits all GPU buffers that
`batch_and_prepare_binned_render_phase` and
`batch_and_prepare_sorted_render_phase` touch into separate buffers
for each phase so that the scheduler will run those phases in parallel.
At the end of batch preparation, we gather the render phases up into a
single resource with a new *collection* phase. Because we already run
mesh preprocessing separately for each phase in order to make occlusion
culling work, this is actually a cleaner separation. For example, mesh
output indices (the unique ID that identifies each mesh instance on GPU)
are now guaranteed to be sequential starting from 0, which will simplify
the forthcoming work to remove them in favor of the compute dispatch ID.

On Caldera, this brings the frame time down to approximately 9.1 ms with
occlusion culling on.
---
 crates/bevy_pbr/src/decal/forward.rs          |   1 +
 crates/bevy_pbr/src/lib.rs                    |  11 +
 crates/bevy_pbr/src/material.rs               |  11 +-
 crates/bevy_pbr/src/prepass/mod.rs            |  27 +-
 crates/bevy_pbr/src/render/gpu_preprocess.rs  | 490 ++++++++++--------
 crates/bevy_pbr/src/render/mesh.rs            | 240 +++++++--
 .../src/batching/gpu_preprocessing.rs         | 445 ++++++++++++----
 crates/bevy_render/src/batching/mod.rs        |   4 +-
 crates/bevy_render/src/lib.rs                 |  10 +-
 crates/bevy_render/src/render_phase/mod.rs    |  92 +++-
 crates/bevy_sprite/src/mesh2d/mesh.rs         |   2 +-
 examples/3d/occlusion_culling.rs              |  35 +-
 examples/shader/specialized_mesh_pipeline.rs  |  25 +-
 13 files changed, 968 insertions(+), 425 deletions(-)
diff --git a/crates/bevy_pbr/src/decal/forward.rs b/crates/bevy_pbr/src/decal/forward.rs
index 1229a688a9cf7..4771ff1a5dac6 100644
--- a/crates/bevy_pbr/src/decal/forward.rs
+++ b/crates/bevy_pbr/src/decal/forward.rs
@@ -48,6 +48,7 @@ impl Plugin for ForwardDecalPlugin {
         app.add_plugins(MaterialPlugin::<ForwardDecalMaterial<StandardMaterial>> {
             prepass_enabled: false,
             shadows_enabled: false,
+            allow_copies_from_indirect_parameters: false,
             ..Default::default()
         });
     }
diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs
index 88403900aeedb..787704c10282a 100644
--- a/crates/bevy_pbr/src/lib.rs
+++ b/crates/bevy_pbr/src/lib.rs
@@ -182,6 +182,12 @@ pub struct PbrPlugin {
     /// This requires compute shader support and so will be forcibly disabled if
     /// the platform doesn't support those.
     pub use_gpu_instance_buffer_builder: bool,
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
 }
 
 impl Default for PbrPlugin {
@@ -190,6 +196,7 @@ impl Default for PbrPlugin {
             prepass_enabled: true,
             add_default_deferred_lighting_plugin: true,
             use_gpu_instance_buffer_builder: true,
+            allow_copies_from_indirect_parameters: false,
         }
     }
 }
@@ -333,9 +340,13 @@ impl Plugin for PbrPlugin {
             .add_plugins((
                 MeshRenderPlugin {
                     use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder,
+                    allow_copies_from_indirect_parameters: self
+                        .allow_copies_from_indirect_parameters,
                 },
                 MaterialPlugin::<StandardMaterial> {
                     prepass_enabled: self.prepass_enabled,
+                    allow_copies_from_indirect_parameters: self
+                        .allow_copies_from_indirect_parameters,
                     ..Default::default()
                 },
                 ScreenSpaceAmbientOcclusionPlugin,
diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs
index 32cc445d4268d..15eb14c8c672d 100644
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@@ -252,6 +252,12 @@ pub struct MaterialPlugin<M: Material> {
     pub prepass_enabled: bool,
     /// Controls if shadows are enabled for the Material.
     pub shadows_enabled: bool,
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
     pub _marker: PhantomData<M>,
 }
 
@@ -260,6 +266,7 @@ impl<M: Material> Default for MaterialPlugin<M> {
         Self {
             prepass_enabled: true,
             shadows_enabled: true,
+            allow_copies_from_indirect_parameters: false,
             _marker: Default::default(),
         }
     }
@@ -374,7 +381,9 @@ where
         }
 
         if self.prepass_enabled {
-            app.add_plugins(PrepassPlugin::<M>::default());
+            app.add_plugins(PrepassPlugin::<M>::new(
+                self.allow_copies_from_indirect_parameters,
+            ));
         }
     }
 
diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs
index d1cb99e502e52..665292e60583f 100644
--- a/crates/bevy_pbr/src/prepass/mod.rs
+++ b/crates/bevy_pbr/src/prepass/mod.rs
@@ -146,11 +146,22 @@ where
 /// Sets up the prepasses for a [`Material`].
 ///
 /// This depends on the [`PrepassPipelinePlugin`].
-pub struct PrepassPlugin<M: Material>(PhantomData<M>);
+pub struct PrepassPlugin<M: Material> {
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
+    pub phantom: PhantomData<M>,
+}
 
-impl<M: Material> Default for PrepassPlugin<M> {
-    fn default() -> Self {
-        Self(Default::default())
+impl<M: Material> PrepassPlugin<M> {
+    pub fn new(allow_copies_from_indirect_parameters: bool) -> Self {
+        PrepassPlugin {
+            allow_copies_from_indirect_parameters,
+            phantom: PhantomData,
+        }
     }
 }
 
@@ -176,8 +187,12 @@ where
                     ),
                 )
                 .add_plugins((
-                    BinnedRenderPhasePlugin::<Opaque3dPrepass, MeshPipeline>::default(),
-                    BinnedRenderPhasePlugin::<AlphaMask3dPrepass, MeshPipeline>::default(),
+                    BinnedRenderPhasePlugin::<Opaque3dPrepass, MeshPipeline>::new(
+                        self.allow_copies_from_indirect_parameters,
+                    ),
+                    BinnedRenderPhasePlugin::<AlphaMask3dPrepass, MeshPipeline>::new(
+                        self.allow_copies_from_indirect_parameters,
+                    ),
                 ));
         }
 
diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs
index 26559f9223dd5..58ef93d906c8b 100644
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@@ -29,12 +29,14 @@ use bevy_ecs::{
     system::{lifetimeless::Read, Commands, Query, Res, ResMut},
     world::{FromWorld, World},
 };
+use bevy_render::batching::gpu_preprocessing::UntypedPhaseIndirectParametersBuffers;
 use bevy_render::{
     batching::gpu_preprocessing::{
         BatchedInstanceBuffers, GpuOcclusionCullingWorkItemBuffers, GpuPreprocessingSupport,
         IndirectBatchSet, IndirectParametersBuffers, IndirectParametersIndexed,
         IndirectParametersMetadata, IndirectParametersNonIndexed,
         LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, PreprocessWorkItemBuffers,
+        UntypedPhaseBatchedInstanceBuffers,
     },
     experimental::occlusion_culling::OcclusionCulling,
     render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext},
@@ -393,8 +395,22 @@ pub enum PhasePreprocessBindGroups {
 
 /// The bind groups for the compute shaders that reset indirect draw counts and
 /// build indirect parameters.
-#[derive(Resource)]
-pub struct BuildIndirectParametersBindGroups {
+///
+/// There's one set of bind groups for each phase. Phases are keyed off their
+/// [`core::any::TypeId`].
+#[derive(Resource, Default, Deref, DerefMut)]
+pub struct BuildIndirectParametersBindGroups(pub TypeIdMap<PhaseBuildIndirectParametersBindGroups>);
+
+impl BuildIndirectParametersBindGroups {
+    /// Creates a new, empty [`BuildIndirectParametersBindGroups`] table.
+    pub fn new() -> BuildIndirectParametersBindGroups {
+        Self::default()
+    }
+}
+
+/// The per-phase set of bind groups for the compute shaders that reset indirect
+/// draw counts and build indirect parameters.
+pub struct PhaseBuildIndirectParametersBindGroups {
     /// The bind group for the `reset_indirect_batch_sets.wgsl` shader, for
     /// indexed meshes.
     reset_indexed_indirect_batch_sets: Option<BindGroup>,
@@ -470,9 +486,10 @@ impl Plugin for GpuMeshPreprocessPlugin {
                 (
                     prepare_preprocess_pipelines.in_set(RenderSet::Prepare),
                     prepare_preprocess_bind_groups
-                        .run_if(
-                            resource_exists::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
-                        )
+                        .run_if(resource_exists::<BatchedInstanceBuffers<
+                            MeshUniform,
+                            MeshInputUniform
+                        >>)
                         .in_set(RenderSet::PrepareBindGroups),
                     write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush),
                 ),
@@ -511,7 +528,7 @@ impl Plugin for GpuMeshPreprocessPlugin {
             .add_render_graph_edge(
                 Core3d,
                 NodePbr::MainBuildIndirectParameters,
-                Node3d::DeferredPrepass
+                Node3d::DeferredPrepass,
             );
     }
 }
@@ -538,10 +555,8 @@ impl Node for EarlyGpuPreprocessNode {
         world: &'w World,
     ) -> Result<(), NodeRunError> {
         // Grab the [`BatchedInstanceBuffers`].
-        let BatchedInstanceBuffers {
-            work_item_buffers: ref index_buffers,
-            ..
-        } = world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
+        let batched_instance_buffers =
+            world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
 
         let pipeline_cache = world.resource::<PipelineCache>();
         let preprocess_pipelines = world.resource::<PreprocessPipelines>();
@@ -583,13 +598,6 @@ impl Node for EarlyGpuPreprocessNode {
                 continue;
             };
 
-            // Grab the work item buffers for this view.
-            let Some(phase_work_item_buffers) = index_buffers.get(&view.retained_view_entity)
-            else {
-                warn!("The preprocessing index buffer wasn't present");
-                continue;
-            };
-
             // Select the right pipeline, depending on whether GPU culling is in
             // use.
             let maybe_pipeline_id = if no_indirect_drawing {
@@ -620,7 +628,17 @@ impl Node for EarlyGpuPreprocessNode {
             compute_pass.set_pipeline(preprocess_pipeline);
 
             // Loop over each render phase.
-            for (phase_type_id, work_item_buffers) in phase_work_item_buffers {
+            for (phase_type_id, batched_phase_instance_buffers) in
+                &batched_instance_buffers.phase_instance_buffers
+            {
+                // Grab the work item buffers for this view.
+                let Some(work_item_buffers) = batched_phase_instance_buffers
+                    .work_item_buffers
+                    .get(&view.retained_view_entity)
+                else {
+                    continue;
+                };
+
                 // Fetch the bind group for the render phase.
                 let Some(phase_bind_groups) = bind_groups.get(phase_type_id) else {
                     continue;
@@ -775,12 +793,8 @@ impl Node for LateGpuPreprocessNode {
         world: &'w World,
     ) -> Result<(), NodeRunError> {
         // Grab the [`BatchedInstanceBuffers`].
-        let BatchedInstanceBuffers {
-            ref work_item_buffers,
-            ref late_indexed_indirect_parameters_buffer,
-            ref late_non_indexed_indirect_parameters_buffer,
-            ..
-        } = world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
+        let batched_instance_buffers =
+            world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
 
         let pipeline_cache = world.resource::<PipelineCache>();
         let preprocess_pipelines = world.resource::<PreprocessPipelines>();
@@ -795,13 +809,6 @@ impl Node for LateGpuPreprocessNode {
 
         // Run the compute passes.
         for (view, bind_groups, view_uniform_offset) in self.view_query.iter_manual(world) {
-            // Grab the work item buffers for this view.
-            let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
-            else {
-                warn!("The preprocessing index buffer wasn't present");
-                continue;
-            };
-
             let maybe_pipeline_id = preprocess_pipelines
                 .late_gpu_occlusion_culling_preprocess
                 .pipeline_id;
@@ -821,7 +828,25 @@ impl Node for LateGpuPreprocessNode {
 
             compute_pass.set_pipeline(preprocess_pipeline);
 
-            for (phase_type_id, work_item_buffers) in phase_work_item_buffers {
+            // Loop over each phase. Because we built the phases in parallel,
+            // each phase has a separate set of instance buffers.
+            for (phase_type_id, batched_phase_instance_buffers) in
+                &batched_instance_buffers.phase_instance_buffers
+            {
+                let UntypedPhaseBatchedInstanceBuffers {
+                    ref work_item_buffers,
+                    ref late_indexed_indirect_parameters_buffer,
+                    ref late_non_indexed_indirect_parameters_buffer,
+                    ..
+                } = *batched_phase_instance_buffers;
+
+                // Grab the work item buffers for this view.
+                let Some(phase_work_item_buffers) =
+                    work_item_buffers.get(&view.retained_view_entity)
+                else {
+                    continue;
+                };
+
                 let (
                     PreprocessWorkItemBuffers::Indirect {
                         gpu_occlusion_culling:
@@ -840,7 +865,7 @@ impl Node for LateGpuPreprocessNode {
                     Some(late_indexed_indirect_parameters_buffer),
                     Some(late_non_indexed_indirect_parameters_buffer),
                 ) = (
-                    work_item_buffers,
+                    phase_work_item_buffers,
                     bind_groups.get(phase_type_id),
                     late_indexed_indirect_parameters_buffer.buffer(),
                     late_non_indexed_indirect_parameters_buffer.buffer(),
@@ -1029,57 +1054,69 @@ fn run_build_indirect_parameters_node(
         return Ok(());
     };
 
-    // Build indexed indirect parameters.
-    if let (
-        Some(reset_indexed_indirect_batch_sets_bind_group),
-        Some(build_indirect_indexed_params_bind_group),
-    ) = (
-        &build_indirect_params_bind_groups.reset_indexed_indirect_batch_sets,
-        &build_indirect_params_bind_groups.build_indexed_indirect,
-    ) {
-        compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
-        compute_pass.set_bind_group(0, reset_indexed_indirect_batch_sets_bind_group, &[]);
-        let workgroup_count = indirect_parameters_buffers
-            .batch_set_count(true)
-            .div_ceil(WORKGROUP_SIZE);
-        if workgroup_count > 0 {
-            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
-        }
+    // Loop over each phase. As each has a separate set of buffers, we need to
+    // build indirect parameters individually for each phase.
+    for (phase_type_id, phase_build_indirect_params_bind_groups) in
+        build_indirect_params_bind_groups.iter()
+    {
+        let Some(phase_indirect_parameters_buffers) =
+            indirect_parameters_buffers.buffers.get(phase_type_id)
+        else {
+            continue;
+        };
 
-        compute_pass.set_pipeline(build_indexed_indirect_params_pipeline);
-        compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]);
-        let workgroup_count = indirect_parameters_buffers
-            .indexed_batch_count()
-            .div_ceil(WORKGROUP_SIZE);
-        if workgroup_count > 0 {
-            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
-        }
-    }
+        // Build indexed indirect parameters.
+        if let (
+            Some(reset_indexed_indirect_batch_sets_bind_group),
+            Some(build_indirect_indexed_params_bind_group),
+        ) = (
+            &phase_build_indirect_params_bind_groups.reset_indexed_indirect_batch_sets,
+            &phase_build_indirect_params_bind_groups.build_indexed_indirect,
+        ) {
+            compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
+            compute_pass.set_bind_group(0, reset_indexed_indirect_batch_sets_bind_group, &[]);
+            let workgroup_count = phase_indirect_parameters_buffers
+                .batch_set_count(true)
+                .div_ceil(WORKGROUP_SIZE);
+            if workgroup_count > 0 {
+                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            }
 
-    // Build non-indexed indirect parameters.
-    if let (
-        Some(reset_non_indexed_indirect_batch_sets_bind_group),
-        Some(build_indirect_non_indexed_params_bind_group),
-    ) = (
-        &build_indirect_params_bind_groups.reset_non_indexed_indirect_batch_sets,
-        &build_indirect_params_bind_groups.build_non_indexed_indirect,
-    ) {
-        compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
-        compute_pass.set_bind_group(0, reset_non_indexed_indirect_batch_sets_bind_group, &[]);
-        let workgroup_count = indirect_parameters_buffers
-            .batch_set_count(false)
-            .div_ceil(WORKGROUP_SIZE);
-        if workgroup_count > 0 {
-            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            compute_pass.set_pipeline(build_indexed_indirect_params_pipeline);
+            compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]);
+            let workgroup_count = phase_indirect_parameters_buffers
+                .indexed_batch_count()
+                .div_ceil(WORKGROUP_SIZE);
+            if workgroup_count > 0 {
+                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            }
         }
 
-        compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline);
-        compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]);
-        let workgroup_count = indirect_parameters_buffers
-            .non_indexed_batch_count()
-            .div_ceil(WORKGROUP_SIZE);
-        if workgroup_count > 0 {
-            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+        // Build non-indexed indirect parameters.
+        if let (
+            Some(reset_non_indexed_indirect_batch_sets_bind_group),
+            Some(build_indirect_non_indexed_params_bind_group),
+        ) = (
+            &phase_build_indirect_params_bind_groups.reset_non_indexed_indirect_batch_sets,
+            &phase_build_indirect_params_bind_groups.build_non_indexed_indirect,
+        ) {
+            compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
+            compute_pass.set_bind_group(0, reset_non_indexed_indirect_batch_sets_bind_group, &[]);
+            let workgroup_count = phase_indirect_parameters_buffers
+                .batch_set_count(false)
+                .div_ceil(WORKGROUP_SIZE);
+            if workgroup_count > 0 {
+                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            }
+
+            compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline);
+            compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]);
+            let workgroup_count = phase_indirect_parameters_buffers
+                .non_indexed_batch_count()
+                .div_ceil(WORKGROUP_SIZE);
+            if workgroup_count > 0 {
+                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            }
         }
     }
 
@@ -1637,18 +1674,14 @@ pub fn prepare_preprocess_bind_groups(
 ) {
     // Grab the `BatchedInstanceBuffers`.
     let BatchedInstanceBuffers {
-        data_buffer: ref data_buffer_vec,
-        ref work_item_buffers,
         current_input_buffer: ref current_input_buffer_vec,
         previous_input_buffer: ref previous_input_buffer_vec,
-        ref late_indexed_indirect_parameters_buffer,
-        ref late_non_indexed_indirect_parameters_buffer,
+        ref phase_instance_buffers,
     } = batched_instance_buffers.into_inner();
 
-    let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = (
+    let (Some(current_input_buffer), Some(previous_input_buffer)) = (
         current_input_buffer_vec.buffer().buffer(),
         previous_input_buffer_vec.buffer().buffer(),
-        data_buffer_vec.buffer(),
     ) else {
         return;
     };
@@ -1659,22 +1692,39 @@ pub fn prepare_preprocess_bind_groups(
 
     // Loop over each view.
     for (view_entity, view) in &views {
-        let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
-        else {
-            continue;
-        };
-
         let mut bind_groups = TypeIdMap::default();
 
         // Loop over each phase.
-        for (&phase_id, work_item_buffers) in phase_work_item_buffers {
+        for (phase_type_id, phase_instance_buffers) in phase_instance_buffers {
+            let UntypedPhaseBatchedInstanceBuffers {
+                data_buffer: ref data_buffer_vec,
+                ref work_item_buffers,
+                ref late_indexed_indirect_parameters_buffer,
+                ref late_non_indexed_indirect_parameters_buffer,
+            } = *phase_instance_buffers;
+
+            let Some(data_buffer) = data_buffer_vec.buffer() else {
+                continue;
+            };
+
+            // Grab the indirect parameters buffers for this phase.
+            let Some(phase_indirect_parameters_buffers) =
+                indirect_parameters_buffers.buffers.get(phase_type_id)
+            else {
+                continue;
+            };
+
+            let Some(work_item_buffers) = work_item_buffers.get(&view.retained_view_entity) else {
+                continue;
+            };
+
             // Create the `PreprocessBindGroupBuilder`.
             let preprocess_bind_group_builder = PreprocessBindGroupBuilder {
                 view: view_entity,
                 late_indexed_indirect_parameters_buffer,
                 late_non_indexed_indirect_parameters_buffer,
                 render_device: &render_device,
-                indirect_parameters_buffers: &indirect_parameters_buffers,
+                phase_indirect_parameters_buffers,
                 mesh_culling_data_buffer: &mesh_culling_data_buffer,
                 view_uniforms: &view_uniforms,
                 previous_view_uniforms: &previous_view_uniforms,
@@ -1725,7 +1775,7 @@ pub fn prepare_preprocess_bind_groups(
             // Write that bind group in.
             if let Some(bind_group) = bind_group {
                 any_indirect = any_indirect || was_indirect;
-                bind_groups.insert(phase_id, bind_group);
+                bind_groups.insert(*phase_type_id, bind_group);
             }
         }
 
@@ -1764,7 +1814,7 @@ struct PreprocessBindGroupBuilder<'a> {
     /// The device.
     render_device: &'a RenderDevice,
     /// The buffers that store indirect draw parameters.
-    indirect_parameters_buffers: &'a IndirectParametersBuffers,
+    phase_indirect_parameters_buffers: &'a UntypedPhaseIndirectParametersBuffers,
     /// The GPU buffer that stores the information needed to cull each mesh.
     mesh_culling_data_buffer: &'a MeshCullingDataBuffer,
     /// The GPU buffer that stores information about the view.
@@ -1884,7 +1934,8 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
         let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
 
         match (
-            self.indirect_parameters_buffers.indexed_metadata_buffer(),
+            self.phase_indirect_parameters_buffers
+                .indexed_metadata_buffer(),
             indexed_work_item_buffer.buffer(),
             late_indexed_work_item_buffer.buffer(),
             self.late_indexed_indirect_parameters_buffer.buffer(),
@@ -1975,7 +2026,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
         let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
 
         match (
-            self.indirect_parameters_buffers
+            self.phase_indirect_parameters_buffers
                 .non_indexed_metadata_buffer(),
             non_indexed_work_item_buffer.buffer(),
             late_non_indexed_work_item_buffer.buffer(),
@@ -2066,7 +2117,8 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
         let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
 
         match (
-            self.indirect_parameters_buffers.indexed_metadata_buffer(),
+            self.phase_indirect_parameters_buffers
+                .indexed_metadata_buffer(),
             late_indexed_work_item_buffer.buffer(),
             self.late_indexed_indirect_parameters_buffer.buffer(),
         ) {
@@ -2146,7 +2198,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
         let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
 
         match (
-            self.indirect_parameters_buffers
+            self.phase_indirect_parameters_buffers
                 .non_indexed_metadata_buffer(),
             late_non_indexed_work_item_buffer.buffer(),
             self.late_non_indexed_indirect_parameters_buffer.buffer(),
@@ -2240,7 +2292,8 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
         let view_uniforms_binding = self.view_uniforms.uniforms.binding()?;
 
         match (
-            self.indirect_parameters_buffers.indexed_metadata_buffer(),
+            self.phase_indirect_parameters_buffers
+                .indexed_metadata_buffer(),
             indexed_work_item_buffer.buffer(),
         ) {
             (Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer)) => {
@@ -2293,7 +2346,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
         let view_uniforms_binding = self.view_uniforms.uniforms.binding()?;
 
         match (
-            self.indirect_parameters_buffers
+            self.phase_indirect_parameters_buffers
                 .non_indexed_metadata_buffer(),
             non_indexed_work_item_buffer.buffer(),
         ) {
@@ -2346,121 +2399,134 @@ fn create_build_indirect_parameters_bind_groups(
     render_device: &RenderDevice,
     pipelines: &PreprocessPipelines,
     current_input_buffer: &Buffer,
-    indirect_parameters_buffer: &IndirectParametersBuffers,
+    indirect_parameters_buffers: &IndirectParametersBuffers,
 ) {
-    commands.insert_resource(BuildIndirectParametersBindGroups {
-        reset_indexed_indirect_batch_sets: match (
-            indirect_parameters_buffer.indexed_batch_sets_buffer(),
-        ) {
-            (Some(indexed_batch_sets_buffer),) => Some(
-                render_device.create_bind_group(
-                    "reset_indexed_indirect_batch_sets_bind_group",
-                    // The early bind group is good for the main phase and late
-                    // phase too. They bind the same buffers.
-                    &pipelines
-                        .early_phase
-                        .reset_indirect_batch_sets
-                        .bind_group_layout,
-                    &BindGroupEntries::sequential((indexed_batch_sets_buffer.as_entire_binding(),)),
-                ),
-            ),
-            _ => None,
-        },
+    let mut build_indirect_parameters_bind_groups = BuildIndirectParametersBindGroups::new();
+
+    for (phase_type_id, phase_indirect_parameters_buffer) in &indirect_parameters_buffers.buffers {
+        build_indirect_parameters_bind_groups.insert(
+            *phase_type_id,
+            PhaseBuildIndirectParametersBindGroups {
+                reset_indexed_indirect_batch_sets: match (
+                    phase_indirect_parameters_buffer.indexed_batch_sets_buffer(),
+                ) {
+                    (Some(indexed_batch_sets_buffer),) => Some(
+                        render_device.create_bind_group(
+                            "reset_indexed_indirect_batch_sets_bind_group",
+                            // The early bind group is good for the main phase and late
+                            // phase too. They bind the same buffers.
+                            &pipelines
+                                .early_phase
+                                .reset_indirect_batch_sets
+                                .bind_group_layout,
+                            &BindGroupEntries::sequential((
+                                indexed_batch_sets_buffer.as_entire_binding(),
+                            )),
+                        ),
+                    ),
+                    _ => None,
+                },
 
-        reset_non_indexed_indirect_batch_sets: match (
-            indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
-        ) {
-            (Some(non_indexed_batch_sets_buffer),) => Some(
-                render_device.create_bind_group(
-                    "reset_non_indexed_indirect_batch_sets_bind_group",
-                    // The early bind group is good for the main phase and late
-                    // phase too. They bind the same buffers.
-                    &pipelines
-                        .early_phase
-                        .reset_indirect_batch_sets
-                        .bind_group_layout,
-                    &BindGroupEntries::sequential((
-                        non_indexed_batch_sets_buffer.as_entire_binding(),
-                    )),
-                ),
-            ),
-            _ => None,
-        },
+                reset_non_indexed_indirect_batch_sets: match (
+                    phase_indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
+                ) {
+                    (Some(non_indexed_batch_sets_buffer),) => Some(
+                        render_device.create_bind_group(
+                            "reset_non_indexed_indirect_batch_sets_bind_group",
+                            // The early bind group is good for the main phase and late
+                            // phase too. They bind the same buffers.
+                            &pipelines
+                                .early_phase
+                                .reset_indirect_batch_sets
+                                .bind_group_layout,
+                            &BindGroupEntries::sequential((
+                                non_indexed_batch_sets_buffer.as_entire_binding(),
+                            )),
+                        ),
+                    ),
+                    _ => None,
+                },
 
-        build_indexed_indirect: match (
-            indirect_parameters_buffer.indexed_metadata_buffer(),
-            indirect_parameters_buffer.indexed_data_buffer(),
-            indirect_parameters_buffer.indexed_batch_sets_buffer(),
-        ) {
-            (
-                Some(indexed_indirect_parameters_metadata_buffer),
-                Some(indexed_indirect_parameters_data_buffer),
-                Some(indexed_batch_sets_buffer),
-            ) => Some(
-                render_device.create_bind_group(
-                    "build_indexed_indirect_parameters_bind_group",
-                    // The frustum culling bind group is good for occlusion culling
-                    // too. They bind the same buffers.
-                    &pipelines
-                        .gpu_frustum_culling_build_indexed_indirect_params
-                        .bind_group_layout,
-                    &BindGroupEntries::sequential((
-                        current_input_buffer.as_entire_binding(),
-                        // Don't use `as_entire_binding` here; the shader reads
-                        // the length and `RawBufferVec` overallocates.
-                        BufferBinding {
-                            buffer: indexed_indirect_parameters_metadata_buffer,
-                            offset: 0,
-                            size: NonZeroU64::new(
-                                indirect_parameters_buffer.indexed_batch_count() as u64
-                                    * size_of::<IndirectParametersMetadata>() as u64,
-                            ),
-                        },
-                        indexed_batch_sets_buffer.as_entire_binding(),
-                        indexed_indirect_parameters_data_buffer.as_entire_binding(),
-                    )),
-                ),
-            ),
-            _ => None,
-        },
+                build_indexed_indirect: match (
+                    phase_indirect_parameters_buffer.indexed_metadata_buffer(),
+                    phase_indirect_parameters_buffer.indexed_data_buffer(),
+                    phase_indirect_parameters_buffer.indexed_batch_sets_buffer(),
+                ) {
+                    (
+                        Some(indexed_indirect_parameters_metadata_buffer),
+                        Some(indexed_indirect_parameters_data_buffer),
+                        Some(indexed_batch_sets_buffer),
+                    ) => Some(
+                        render_device.create_bind_group(
+                            "build_indexed_indirect_parameters_bind_group",
+                            // The frustum culling bind group is good for occlusion culling
+                            // too. They bind the same buffers.
+                            &pipelines
+                                .gpu_frustum_culling_build_indexed_indirect_params
+                                .bind_group_layout,
+                            &BindGroupEntries::sequential((
+                                current_input_buffer.as_entire_binding(),
+                                // Don't use `as_entire_binding` here; the shader reads
+                                // the length and `RawBufferVec` overallocates.
+                                BufferBinding {
+                                    buffer: indexed_indirect_parameters_metadata_buffer,
+                                    offset: 0,
+                                    size: NonZeroU64::new(
+                                        phase_indirect_parameters_buffer.indexed_batch_count()
+                                            as u64
+                                            * size_of::<IndirectParametersMetadata>() as u64,
+                                    ),
+                                },
+                                indexed_batch_sets_buffer.as_entire_binding(),
+                                indexed_indirect_parameters_data_buffer.as_entire_binding(),
+                            )),
+                        ),
+                    ),
+                    _ => None,
+                },
 
-        build_non_indexed_indirect: match (
-            indirect_parameters_buffer.non_indexed_metadata_buffer(),
-            indirect_parameters_buffer.non_indexed_data_buffer(),
-            indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
-        ) {
-            (
-                Some(non_indexed_indirect_parameters_metadata_buffer),
-                Some(non_indexed_indirect_parameters_data_buffer),
-                Some(non_indexed_batch_sets_buffer),
-            ) => Some(
-                render_device.create_bind_group(
-                    "build_non_indexed_indirect_parameters_bind_group",
-                    // The frustum culling bind group is good for occlusion culling
-                    // too. They bind the same buffers.
-                    &pipelines
-                        .gpu_frustum_culling_build_non_indexed_indirect_params
-                        .bind_group_layout,
-                    &BindGroupEntries::sequential((
-                        current_input_buffer.as_entire_binding(),
-                        // Don't use `as_entire_binding` here; the shader reads
-                        // the length and `RawBufferVec` overallocates.
-                        BufferBinding {
-                            buffer: non_indexed_indirect_parameters_metadata_buffer,
-                            offset: 0,
-                            size: NonZeroU64::new(
-                                indirect_parameters_buffer.non_indexed_batch_count() as u64
-                                    * size_of::<IndirectParametersMetadata>() as u64,
-                            ),
-                        },
-                        non_indexed_batch_sets_buffer.as_entire_binding(),
-                        non_indexed_indirect_parameters_data_buffer.as_entire_binding(),
-                    )),
-                ),
-            ),
-            _ => None,
-        },
-    });
+                build_non_indexed_indirect: match (
+                    phase_indirect_parameters_buffer.non_indexed_metadata_buffer(),
+                    phase_indirect_parameters_buffer.non_indexed_data_buffer(),
+                    phase_indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
+                ) {
+                    (
+                        Some(non_indexed_indirect_parameters_metadata_buffer),
+                        Some(non_indexed_indirect_parameters_data_buffer),
+                        Some(non_indexed_batch_sets_buffer),
+                    ) => Some(
+                        render_device.create_bind_group(
+                            "build_non_indexed_indirect_parameters_bind_group",
+                            // The frustum culling bind group is good for occlusion culling
+                            // too. They bind the same buffers.
+                            &pipelines
+                                .gpu_frustum_culling_build_non_indexed_indirect_params
+                                .bind_group_layout,
+                            &BindGroupEntries::sequential((
+                                current_input_buffer.as_entire_binding(),
+                                // Don't use `as_entire_binding` here; the shader reads
+                                // the length and `RawBufferVec` overallocates.
+                                BufferBinding {
+                                    buffer: non_indexed_indirect_parameters_metadata_buffer,
+                                    offset: 0,
+                                    size: NonZeroU64::new(
+                                        phase_indirect_parameters_buffer.non_indexed_batch_count()
+                                            as u64
+                                            * size_of::<IndirectParametersMetadata>() as u64,
+                                    ),
+                                },
+                                non_indexed_batch_sets_buffer.as_entire_binding(),
+                                non_indexed_indirect_parameters_data_buffer.as_entire_binding(),
+                            )),
+                        ),
+                    ),
+                    _ => None,
+                },
+            },
+        );
+    }
+
+    commands.insert_resource(build_indirect_parameters_bind_groups);
 }
 
 /// Writes the information needed to do GPU mesh culling to the GPU.
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index e716eb166e628..cd8c40c49d8df 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -21,7 +21,7 @@ use bevy_render::{
         gpu_preprocessing::{
             self, GpuPreprocessingSupport, IndirectBatchSet, IndirectParametersBuffers,
             IndirectParametersIndexed, IndirectParametersMetadata, IndirectParametersNonIndexed,
-            InstanceInputUniformBuffer,
+            InstanceInputUniformBuffer, UntypedPhaseIndirectParametersBuffers,
         },
         no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching,
     },
@@ -43,7 +43,8 @@ use bevy_render::{
     Extract,
 };
 use bevy_transform::components::GlobalTransform;
-use bevy_utils::{default, Parallel};
+use bevy_utils::{default, Parallel, TypeIdMap};
+use core::any::TypeId;
 use core::mem::size_of;
 use material_bind_groups::MaterialBindingId;
 use render::skin::{self, SkinIndex};
@@ -79,13 +80,27 @@ use smallvec::{smallvec, SmallVec};
 use static_assertions::const_assert_eq;
 
 /// Provides support for rendering 3D meshes.
-#[derive(Default)]
 pub struct MeshRenderPlugin {
     /// Whether we're building [`MeshUniform`]s on GPU.
     ///
     /// This requires compute shader support and so will be forcibly disabled if
     /// the platform doesn't support those.
     pub use_gpu_instance_buffer_builder: bool,
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
+}
+
+impl MeshRenderPlugin {
+    pub fn new(allow_copies_from_indirect_parameters: bool) -> MeshRenderPlugin {
+        MeshRenderPlugin {
+            use_gpu_instance_buffer_builder: false,
+            allow_copies_from_indirect_parameters,
+        }
+    }
 }
 
 pub const FORWARD_IO_HANDLE: Handle<Shader> = weak_handle!("38111de1-6e35-4dbb-877b-7b6f9334baf6");
@@ -166,18 +181,31 @@ impl Plugin for MeshRenderPlugin {
             (no_automatic_skin_batching, no_automatic_morph_batching),
         )
         .add_plugins((
-            BinnedRenderPhasePlugin::<Opaque3d, MeshPipeline>::default(),
-            BinnedRenderPhasePlugin::<AlphaMask3d, MeshPipeline>::default(),
-            BinnedRenderPhasePlugin::<Shadow, MeshPipeline>::default(),
-            BinnedRenderPhasePlugin::<Opaque3dDeferred, MeshPipeline>::default(),
-            BinnedRenderPhasePlugin::<AlphaMask3dDeferred, MeshPipeline>::default(),
-            SortedRenderPhasePlugin::<Transmissive3d, MeshPipeline>::default(),
-            SortedRenderPhasePlugin::<Transparent3d, MeshPipeline>::default(),
+            BinnedRenderPhasePlugin::<Opaque3d, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
+            BinnedRenderPhasePlugin::<AlphaMask3d, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
+            BinnedRenderPhasePlugin::<Shadow, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
+            BinnedRenderPhasePlugin::<Opaque3dDeferred, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
+            BinnedRenderPhasePlugin::<AlphaMask3dDeferred, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
+            SortedRenderPhasePlugin::<Transmissive3d, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
+            SortedRenderPhasePlugin::<Transparent3d, MeshPipeline>::new(
+                self.allow_copies_from_indirect_parameters,
+            ),
         ));
 
         if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
             render_app
-                .init_resource::<MeshBindGroups>()
                 .init_resource::<SkinIndices>()
                 .init_resource::<MorphUniforms>()
                 .init_resource::<MorphIndices>()
@@ -202,7 +230,7 @@ impl Plugin for MeshRenderPlugin {
                         set_mesh_motion_vector_flags.in_set(RenderSet::PrepareMeshes),
                         prepare_skins.in_set(RenderSet::PrepareResources),
                         prepare_morphs.in_set(RenderSet::PrepareResources),
-                        prepare_mesh_bind_group.in_set(RenderSet::PrepareBindGroups),
+                        prepare_mesh_bind_groups.in_set(RenderSet::PrepareBindGroups),
                         prepare_mesh_view_bind_groups
                             .in_set(RenderSet::PrepareBindGroups)
                             .after(prepare_oit_buffers),
@@ -238,12 +266,14 @@ impl Plugin for MeshRenderPlugin {
 
             if use_gpu_instance_buffer_builder {
                 render_app
-                    .init_resource::<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>()
+                    .init_resource::<gpu_preprocessing::BatchedInstanceBuffers<
+                        MeshUniform,
+                        MeshInputUniform
+                    >>()
                     .init_resource::<RenderMeshInstanceGpuQueues>()
                     .add_systems(
                         ExtractSchedule,
-                        extract_meshes_for_gpu_building
-                            .in_set(ExtractMeshesSet),
+                        extract_meshes_for_gpu_building.in_set(ExtractMeshesSet),
                     )
                     .add_systems(
                         Render,
@@ -1933,7 +1963,7 @@ impl GetFullBatchData for MeshPipeline {
         indexed: bool,
         base_output_index: u32,
         batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffer: &mut IndirectParametersBuffers,
+        phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
         indirect_parameters_offset: u32,
     ) {
         let indirect_parameters = IndirectParametersMetadata {
@@ -1948,9 +1978,10 @@ impl GetFullBatchData for MeshPipeline {
         };
 
         if indexed {
-            indirect_parameters_buffer.set_indexed(indirect_parameters_offset, indirect_parameters);
+            phase_indirect_parameters_buffers
+                .set_indexed(indirect_parameters_offset, indirect_parameters);
         } else {
-            indirect_parameters_buffer
+            phase_indirect_parameters_buffers
                 .set_non_indexed(indirect_parameters_offset, indirect_parameters);
         }
     }
@@ -2544,9 +2575,12 @@ impl SpecializedMeshPipeline for MeshPipeline {
     }
 }
 
-/// Bind groups for meshes currently loaded.
-#[derive(Resource, Default)]
-pub struct MeshBindGroups {
+/// The bind groups for meshes currently loaded.
+///
+/// If GPU mesh preprocessing isn't in use, these are global to the scene. If
+/// GPU mesh preprocessing is in use, these are specific to a single phase.
+#[derive(Default)]
+pub struct MeshPhaseBindGroups {
     model_only: Option<BindGroup>,
     skinned: Option<MeshBindGroupPair>,
     morph_targets: HashMap<AssetId<Mesh>, MeshBindGroupPair>,
@@ -2558,7 +2592,18 @@ pub struct MeshBindGroupPair {
     no_motion_vectors: BindGroup,
 }
 
-impl MeshBindGroups {
+/// All bind groups for meshes currently loaded.
+#[derive(Resource)]
+pub enum MeshBindGroups {
+    /// The bind groups for the meshes for the entire scene, if GPU mesh
+    /// preprocessing isn't in use.
+    CpuPreprocessing(MeshPhaseBindGroups),
+    /// A mapping from the type ID of a phase (e.g. [`Opaque3d`]) to the mesh
+    /// bind groups for that phase.
+    GpuPreprocessing(TypeIdMap<MeshPhaseBindGroups>),
+}
+
+impl MeshPhaseBindGroups {
     pub fn reset(&mut self) {
         self.model_only = None;
         self.skinned = None;
@@ -2600,9 +2645,10 @@ impl MeshBindGroupPair {
     }
 }
 
-pub fn prepare_mesh_bind_group(
+/// Creates the per-mesh bind groups for each type of mesh and each phase.
+pub fn prepare_mesh_bind_groups(
+    mut commands: Commands,
     meshes: Res<RenderAssets<RenderMesh>>,
-    mut groups: ResMut<MeshBindGroups>,
     mesh_pipeline: Res<MeshPipeline>,
     render_device: Res<RenderDevice>,
     cpu_batched_instance_buffer: Option<
@@ -2615,24 +2661,80 @@ pub fn prepare_mesh_bind_group(
     weights_uniform: Res<MorphUniforms>,
     mut render_lightmaps: ResMut<RenderLightmaps>,
 ) {
-    groups.reset();
+    // CPU mesh preprocessing path.
+    if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
+        if let Some(instance_data_binding) = cpu_batched_instance_buffer
+            .into_inner()
+            .instance_data_binding()
+        {
+            // In this path, we only have a single set of bind groups for all phases.
+            let cpu_preprocessing_mesh_bind_groups = prepare_mesh_bind_groups_for_phase(
+                instance_data_binding,
+                &meshes,
+                &mesh_pipeline,
+                &render_device,
+                &skins_uniform,
+                &weights_uniform,
+                &mut render_lightmaps,
+            );
 
+            commands.insert_resource(MeshBindGroups::CpuPreprocessing(
+                cpu_preprocessing_mesh_bind_groups,
+            ));
+            return;
+        }
+    }
+
+    // GPU mesh preprocessing path.
+    if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
+        let mut gpu_preprocessing_mesh_bind_groups = TypeIdMap::default();
+
+        // Loop over each phase.
+        for (phase_type_id, batched_phase_instance_buffers) in
+            &gpu_batched_instance_buffers.phase_instance_buffers
+        {
+            let Some(instance_data_binding) =
+                batched_phase_instance_buffers.instance_data_binding()
+            else {
+                continue;
+            };
+
+            let mesh_phase_bind_groups = prepare_mesh_bind_groups_for_phase(
+                instance_data_binding,
+                &meshes,
+                &mesh_pipeline,
+                &render_device,
+                &skins_uniform,
+                &weights_uniform,
+                &mut render_lightmaps,
+            );
+
+            gpu_preprocessing_mesh_bind_groups.insert(*phase_type_id, mesh_phase_bind_groups);
+        }
+
+        commands.insert_resource(MeshBindGroups::GpuPreprocessing(
+            gpu_preprocessing_mesh_bind_groups,
+        ));
+    }
+}
+
+/// Creates the per-mesh bind groups for each type of mesh, for a single phase.
+fn prepare_mesh_bind_groups_for_phase(
+    model: BindingResource,
+    meshes: &RenderAssets<RenderMesh>,
+    mesh_pipeline: &MeshPipeline,
+    render_device: &RenderDevice,
+    skins_uniform: &SkinUniforms,
+    weights_uniform: &MorphUniforms,
+    render_lightmaps: &mut RenderLightmaps,
+) -> MeshPhaseBindGroups {
     let layouts = &mesh_pipeline.mesh_layouts;
 
-    let model = if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
-        cpu_batched_instance_buffer
-            .into_inner()
-            .instance_data_binding()
-    } else if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
-        gpu_batched_instance_buffers
-            .into_inner()
-            .instance_data_binding()
-    } else {
-        return;
+    // TODO: Reuse allocations.
+    let mut groups = MeshPhaseBindGroups {
+        model_only: Some(layouts.model_only(render_device, &model)),
+        ..default()
     };
-    let Some(model) = model else { return };
-
-    groups.model_only = Some(layouts.model_only(&render_device, &model));
 
     // Create the skinned mesh bind group with the current and previous buffers
     // (the latter being for motion vector computation). If there's no previous
@@ -2641,8 +2743,8 @@ pub fn prepare_mesh_bind_group(
     if let Some(skin) = skin {
         let prev_skin = skins_uniform.prev_buffer.buffer().unwrap_or(skin);
         groups.skinned = Some(MeshBindGroupPair {
-            motion_vectors: layouts.skinned_motion(&render_device, &model, skin, prev_skin),
-            no_motion_vectors: layouts.skinned(&render_device, &model, skin),
+            motion_vectors: layouts.skinned_motion(render_device, &model, skin, prev_skin),
+            no_motion_vectors: layouts.skinned(render_device, &model, skin),
         });
     }
 
@@ -2657,7 +2759,7 @@ pub fn prepare_mesh_bind_group(
                         let prev_skin = skins_uniform.prev_buffer.buffer().unwrap_or(skin);
                         MeshBindGroupPair {
                             motion_vectors: layouts.morphed_skinned_motion(
-                                &render_device,
+                                render_device,
                                 &model,
                                 skin,
                                 weights,
@@ -2666,7 +2768,7 @@ pub fn prepare_mesh_bind_group(
                                 prev_weights,
                             ),
                             no_motion_vectors: layouts.morphed_skinned(
-                                &render_device,
+                                render_device,
                                 &model,
                                 skin,
                                 weights,
@@ -2676,18 +2778,13 @@ pub fn prepare_mesh_bind_group(
                     }
                     None => MeshBindGroupPair {
                         motion_vectors: layouts.morphed_motion(
-                            &render_device,
+                            render_device,
                             &model,
                             weights,
                             targets,
                             prev_weights,
                         ),
-                        no_motion_vectors: layouts.morphed(
-                            &render_device,
-                            &model,
-                            weights,
-                            targets,
-                        ),
+                        no_motion_vectors: layouts.morphed(render_device, &model, weights, targets),
                     },
                 };
                 groups.morph_targets.insert(id, bind_group_pair);
@@ -2700,9 +2797,11 @@ pub fn prepare_mesh_bind_group(
     for (lightmap_slab_id, lightmap_slab) in render_lightmaps.slabs.iter_mut().enumerate() {
         groups.lightmaps.insert(
             LightmapSlabIndex(NonMaxU32::new(lightmap_slab_id as u32).unwrap()),
-            layouts.lightmapped(&render_device, &model, lightmap_slab, bindless_supported),
+            layouts.lightmapped(render_device, &model, lightmap_slab, bindless_supported),
         );
     }
+
+    groups
 }
 
 pub struct SetMeshViewBindGroup<const I: usize>;
@@ -2806,7 +2905,20 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
             .get(entity)
             .map(|render_lightmap| render_lightmap.slab_index);
 
-        let Some(bind_group) = bind_groups.get(
+        let Some(mesh_phase_bind_groups) = (match *bind_groups {
+            MeshBindGroups::CpuPreprocessing(ref mesh_phase_bind_groups) => {
+                Some(mesh_phase_bind_groups)
+            }
+            MeshBindGroups::GpuPreprocessing(ref mesh_phase_bind_groups) => {
+                mesh_phase_bind_groups.get(&TypeId::of::<P>())
+            }
+        }) else {
+            // This is harmless if e.g. we're rendering the `Shadow` phase and
+            // there weren't any shadows.
+            return RenderCommandResult::Success;
+        };
+
+        let Some(bind_group) = mesh_phase_bind_groups.get(
             mesh_asset_id,
             lightmap_slab_index,
             is_skinned,
@@ -2958,9 +3070,18 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                         // Look up the indirect parameters buffer, as well as
                         // the buffer we're going to use for
                         // `multi_draw_indexed_indirect_count` (if available).
+                        let Some(phase_indirect_parameters_buffers) =
+                            indirect_parameters_buffer.buffers.get(&TypeId::of::<P>())
+                        else {
+                            warn!(
+                                "Not rendering mesh because indexed indirect parameters buffer \
+                                 wasn't present for this phase",
+                            );
+                            return RenderCommandResult::Skip;
+                        };
                         let (Some(indirect_parameters_buffer), Some(batch_sets_buffer)) = (
-                            indirect_parameters_buffer.indexed_data_buffer(),
-                            indirect_parameters_buffer.indexed_batch_sets_buffer(),
+                            phase_indirect_parameters_buffers.indexed_data_buffer(),
+                            phase_indirect_parameters_buffers.indexed_batch_sets_buffer(),
                         ) else {
                             warn!(
                                 "Not rendering mesh because indexed indirect parameters buffer \
@@ -3015,9 +3136,18 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                     // Look up the indirect parameters buffer, as well as the
                     // buffer we're going to use for
                     // `multi_draw_indirect_count` (if available).
+                    let Some(phase_indirect_parameters_buffers) =
+                        indirect_parameters_buffer.buffers.get(&TypeId::of::<P>())
+                    else {
+                        warn!(
+                            "Not rendering mesh because non-indexed indirect parameters buffer \
+                             wasn't present for this phase",
+                        );
+                        return RenderCommandResult::Skip;
+                    };
                     let (Some(indirect_parameters_buffer), Some(batch_sets_buffer)) = (
-                        indirect_parameters_buffer.non_indexed_data_buffer(),
-                        indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
+                        phase_indirect_parameters_buffers.non_indexed_data_buffer(),
+                        phase_indirect_parameters_buffers.non_indexed_batch_sets_buffer(),
                     ) else {
                         warn!(
                             "Not rendering mesh because non-indexed indirect parameters buffer \
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index ccf227fd7401f..2e8cf0208f1e8 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -1,6 +1,6 @@
 //! Batching functionality when GPU preprocessing is in use.
 
-use core::any::TypeId;
+use core::{any::TypeId, marker::PhantomData, mem};
 
 use bevy_app::{App, Plugin};
 use bevy_ecs::{
@@ -24,9 +24,9 @@ use crate::{
     experimental::occlusion_culling::OcclusionCulling,
     render_phase::{
         BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
-        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItemBatchSetKey as _,
-        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices,
-        ViewBinnedRenderPhases, ViewSortedRenderPhases,
+        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem,
+        PhaseItemBatchSetKey as _, PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
+        UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, ViewSortedRenderPhases,
     },
     render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
     renderer::{RenderAdapter, RenderDevice, RenderQueue},
@@ -147,18 +147,6 @@ where
     BD: GpuArrayBufferable + Sync + Send + 'static,
     BDI: Pod + Default,
 {
-    /// A storage area for the buffer data that the GPU compute shader is
-    /// expected to write to.
-    ///
-    /// There will be one entry for each index.
-    pub data_buffer: UninitBufferVec<BD>,
-
-    /// The index of the buffer data in the current input buffer that
-    /// corresponds to each instance.
-    ///
-    /// This is keyed off each view. Each view has a separate buffer.
-    pub work_item_buffers: HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
-
     /// The uniform data inputs for the current frame.
     ///
     /// These are uploaded during the extraction phase.
@@ -173,6 +161,81 @@ where
     /// corresponding buffer data input uniform in this list.
     pub previous_input_buffer: InstanceInputUniformBuffer<BDI>,
 
+    /// The GPU buffers needed to render batches for each phase.
+    ///
+    /// The keys of this map are the type IDs of each phase: e.g. `Opaque3d`,
+    /// `AlphaMask3d`, etc.
+    pub phase_instance_buffers: TypeIdMap<UntypedPhaseBatchedInstanceBuffers<BD>>,
+}
+
+impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+    BDI: Pod + Sync + Send + Default + 'static,
+{
+    fn default() -> Self {
+        BatchedInstanceBuffers {
+            current_input_buffer: InstanceInputUniformBuffer::new(),
+            previous_input_buffer: InstanceInputUniformBuffer::new(),
+            phase_instance_buffers: HashMap::default(),
+        }
+    }
+}
+
+/// The GPU buffers holding the data needed to render batches for a single
+/// phase.
+///
+/// This is the version of the structure that has a type parameter. Because
+/// each phase gets its own distinct resource type, Bevy's scheduler can run
+/// the batching systems for the different phases in parallel instead of
+/// serializing them on a single shared resource.
+///
+/// See the documentation for [`BatchedInstanceBuffers`] for more information.
+#[derive(Resource)]
+pub struct PhaseBatchedInstanceBuffers<PI, BD>
+where
+    PI: PhaseItem,
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    /// The buffers for this phase.
+    pub buffers: UntypedPhaseBatchedInstanceBuffers<BD>,
+    phantom: PhantomData<PI>,
+}
+
+impl<PI, BD> Default for PhaseBatchedInstanceBuffers<PI, BD>
+where
+    PI: PhaseItem,
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    fn default() -> Self {
+        PhaseBatchedInstanceBuffers {
+            buffers: UntypedPhaseBatchedInstanceBuffers::default(),
+            phantom: PhantomData,
+        }
+    }
+}
+
+/// The GPU buffers holding the data needed to render batches for a single
+/// phase, without a type parameter for that phase.
+///
+/// Since this structure doesn't have a type parameter, it can be placed in
+/// [`BatchedInstanceBuffers::phase_instance_buffers`].
+pub struct UntypedPhaseBatchedInstanceBuffers<BD>
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    /// A storage area for the buffer data that the GPU compute shader is
+    /// expected to write to.
+    ///
+    /// There will be one entry for each index.
+    pub data_buffer: UninitBufferVec<BD>,
+
+    /// The index of the buffer data in the current input buffer that
+    /// corresponds to each instance.
+    ///
+    /// This is keyed off each view. Each view has a separate buffer.
+    pub work_item_buffers: HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
+
     /// A buffer that holds the number of indexed meshes that weren't visible in
     /// the previous frame, when GPU occlusion culling is in use.
     ///
@@ -351,11 +414,11 @@ pub struct GpuOcclusionCullingWorkItemBuffers {
     /// The buffer of work items corresponding to non-indexed meshes.
     pub late_non_indexed: UninitBufferVec<PreprocessWorkItem>,
     /// The offset into the
-    /// [`BatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`]
+    /// [`UntypedPhaseBatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`]
     /// where this view's indirect dispatch counts for indexed meshes live.
     pub late_indirect_parameters_indexed_offset: u32,
     /// The offset into the
-    /// [`BatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`]
+    /// [`UntypedPhaseBatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`]
     /// where this view's indirect dispatch counts for non-indexed meshes live.
     pub late_indirect_parameters_non_indexed_offset: u32,
 }
@@ -409,7 +472,7 @@ impl Default for LatePreprocessWorkItemIndirectParameters {
 /// You may need to call this function if you're implementing your own custom
 /// render phases. See the `specialized_mesh_pipeline` example.
 pub fn get_or_create_work_item_buffer<'a, I>(
-    work_item_buffers: &'a mut HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
+    work_item_buffers: &'a mut HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
     view: RetainedViewEntity,
     no_indirect_drawing: bool,
     enable_gpu_occlusion_culling: bool,
@@ -417,11 +480,7 @@ pub fn get_or_create_work_item_buffer<'a, I>(
 where
     I: 'static,
 {
-    let preprocess_work_item_buffers = match work_item_buffers
-        .entry(view)
-        .or_default()
-        .entry(TypeId::of::<I>())
-    {
+    let preprocess_work_item_buffers = match work_item_buffers.entry(view) {
         Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
         Entry::Vacant(vacant_entry) => {
             if no_indirect_drawing {
@@ -702,6 +761,68 @@ pub struct IndirectBatchSet {
 /// These buffers will remain empty if indirect drawing isn't in use.
 #[derive(Resource)]
 pub struct IndirectParametersBuffers {
+    /// A mapping from a phase type ID to the indirect parameters buffers for
+    /// that phase.
+    ///
+    /// Examples of phase type IDs are `Opaque3d` and `AlphaMask3d`.
+    pub buffers: TypeIdMap<UntypedPhaseIndirectParametersBuffers>,
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameter_buffers: bool,
+}
+
+impl IndirectParametersBuffers {
+    /// Initializes a new [`IndirectParametersBuffers`] resource.
+    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers {
+        IndirectParametersBuffers {
+            buffers: TypeIdMap::default(),
+            allow_copies_from_indirect_parameter_buffers,
+        }
+    }
+}
+
+/// The buffers containing all the information that indirect draw commands use
+/// to draw the scene, for a single phase.
+///
+/// This is the version of the structure that has a type parameter, so that the
+/// batching for different phases can run in parallel.
+///
+/// See the [`IndirectParametersBuffers`] documentation for more information.
+#[derive(Resource)]
+pub struct PhaseIndirectParametersBuffers<PI>
+where
+    PI: PhaseItem,
+{
+    /// The indirect draw buffers for the phase.
+    pub buffers: UntypedPhaseIndirectParametersBuffers,
+    phantom: PhantomData<PI>,
+}
+
+impl<PI> PhaseIndirectParametersBuffers<PI>
+where
+    PI: PhaseItem,
+{
+    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> Self {
+        PhaseIndirectParametersBuffers {
+            buffers: UntypedPhaseIndirectParametersBuffers::new(
+                allow_copies_from_indirect_parameter_buffers,
+            ),
+            phantom: PhantomData,
+        }
+    }
+}
+
+/// The buffers containing all the information that indirect draw commands use
+/// to draw the scene, for a single phase.
+///
+/// This is the version of the structure that doesn't have a type parameter, so
+/// that it can be inserted into [`IndirectParametersBuffers::buffers`].
+///
+/// See the [`IndirectParametersBuffers`] documentation for more information.
+pub struct UntypedPhaseIndirectParametersBuffers {
     /// The GPU buffer that stores the indirect draw parameters for non-indexed
     /// meshes.
     ///
@@ -751,15 +872,17 @@ pub struct IndirectParametersBuffers {
     indexed_batch_sets: RawBufferVec<IndirectBatchSet>,
 }
 
-impl IndirectParametersBuffers {
+impl UntypedPhaseIndirectParametersBuffers {
     /// Creates the indirect parameters buffers.
-    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers {
+    pub fn new(
+        allow_copies_from_indirect_parameter_buffers: bool,
+    ) -> UntypedPhaseIndirectParametersBuffers {
         let mut indirect_parameter_buffer_usages = BufferUsages::STORAGE | BufferUsages::INDIRECT;
         if allow_copies_from_indirect_parameter_buffers {
             indirect_parameter_buffer_usages |= BufferUsages::COPY_SRC;
         }
 
-        IndirectParametersBuffers {
+        UntypedPhaseIndirectParametersBuffers {
             non_indexed_data: UninitBufferVec::new(indirect_parameter_buffer_usages),
             non_indexed_metadata: RawBufferVec::new(BufferUsages::STORAGE),
             non_indexed_batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages),
@@ -952,6 +1075,15 @@ impl IndirectParametersBuffers {
     pub fn get_next_batch_set_index(&self, indexed: bool) -> Option<NonMaxU32> {
         NonMaxU32::new(self.batch_set_count(indexed) as u32)
     }
+
+    pub fn clear(&mut self) {
+        self.indexed_data.clear();
+        self.indexed_metadata.clear();
+        self.indexed_batch_sets.clear();
+        self.non_indexed_data.clear();
+        self.non_indexed_metadata.clear();
+        self.non_indexed_batch_sets.clear();
+    }
 }
 
 impl Default for IndirectParametersBuffers {
@@ -1007,11 +1139,24 @@ where
 {
     /// Creates new buffers.
     pub fn new() -> Self {
-        BatchedInstanceBuffers {
+        Self::default()
+    }
+
+    /// Clears out the buffers in preparation for a new frame.
+    pub fn clear(&mut self) {
+        // TODO: Don't do this; it drops every phase's buffers and their allocations.
+        self.phase_instance_buffers.clear();
+    }
+}
+
+impl<BD> UntypedPhaseBatchedInstanceBuffers<BD>
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    pub fn new() -> Self {
+        UntypedPhaseBatchedInstanceBuffers {
             data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
             work_item_buffers: HashMap::default(),
-            current_input_buffer: InstanceInputUniformBuffer::new(),
-            previous_input_buffer: InstanceInputUniformBuffer::new(),
             late_indexed_indirect_parameters_buffer: RawBufferVec::new(
                 BufferUsages::STORAGE | BufferUsages::INDIRECT,
             ),
@@ -1039,17 +1184,14 @@ where
         // Clear each individual set of buffers, but don't depopulate the hash
         // table. We want to avoid reallocating these vectors every frame.
         for view_work_item_buffers in self.work_item_buffers.values_mut() {
-            for phase_work_item_buffers in view_work_item_buffers.values_mut() {
-                phase_work_item_buffers.clear();
-            }
+            view_work_item_buffers.clear();
         }
     }
 }
 
-impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
+impl<BD> Default for UntypedPhaseBatchedInstanceBuffers<BD>
 where
     BD: GpuArrayBufferable + Sync + Send + 'static,
-    BDI: Pod + Default + Sync + Send + 'static,
 {
     fn default() -> Self {
         Self::new()
@@ -1098,7 +1240,7 @@ where
         self,
         instance_end_index: u32,
         phase: &mut SortedRenderPhase<I>,
-        indirect_parameters_buffers: &mut IndirectParametersBuffers,
+        phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
     ) where
         I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
     {
@@ -1114,7 +1256,7 @@ where
             None => PhaseItemExtraIndex::None,
         };
         if let Some(indirect_parameters_index) = self.indirect_parameters_index {
-            indirect_parameters_buffers
+            phase_indirect_parameters_buffers
                 .add_batch_set(self.indexed, indirect_parameters_index.into());
         }
     }
@@ -1156,17 +1298,23 @@ pub fn delete_old_work_item_buffers<GFBD>(
         .iter()
         .map(|extracted_view| extracted_view.retained_view_entity)
         .collect();
-    gpu_batched_instance_buffers
-        .work_item_buffers
-        .retain(|retained_view_entity, _| retained_view_entities.contains(retained_view_entity));
+    for phase_instance_buffers in gpu_batched_instance_buffers
+        .phase_instance_buffers
+        .values_mut()
+    {
+        phase_instance_buffers
+            .work_item_buffers
+            .retain(|retained_view_entity, _| {
+                retained_view_entities.contains(retained_view_entity)
+            });
+    }
 }
 
 /// Batch the items in a sorted render phase, when GPU instance buffer building
 /// is in use. This means comparing metadata needed to draw each phase item and
 /// trying to combine the draws into a batch.
 pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
-    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
-    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
+    indirect_parameters_buffers: Res<IndirectParametersBuffers>,
     mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
     mut views: Query<(
         &ExtractedView,
@@ -1178,14 +1326,19 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
     I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
     GFBD: GetFullBatchData,
 {
+    let mut phase_batched_instance_buffers =
+        UntypedPhaseBatchedInstanceBuffers::<GFBD::BufferData>::new();
+    let mut phase_indirect_parameters_buffers = UntypedPhaseIndirectParametersBuffers::new(
+        indirect_parameters_buffers.allow_copies_from_indirect_parameter_buffers,
+    );
+
     // We only process GPU-built batch data in this function.
-    let BatchedInstanceBuffers {
+    let UntypedPhaseBatchedInstanceBuffers {
         ref mut data_buffer,
         ref mut work_item_buffers,
         ref mut late_indexed_indirect_parameters_buffer,
         ref mut late_non_indexed_indirect_parameters_buffer,
-        ..
-    } = gpu_array_buffer.into_inner();
+    } = phase_batched_instance_buffers;
 
     for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
         let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else {
@@ -1231,7 +1384,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
                     batch.flush(
                         data_buffer.len() as u32,
                         phase,
-                        &mut indirect_parameters_buffers,
+                        &mut phase_indirect_parameters_buffers,
                     );
                 }
 
@@ -1257,15 +1410,15 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
             if !can_batch {
                 // Break a batch if we need to.
                 if let Some(batch) = batch.take() {
-                    batch.flush(output_index, phase, &mut indirect_parameters_buffers);
+                    batch.flush(output_index, phase, &mut phase_indirect_parameters_buffers);
                 }
 
                 let indirect_parameters_index = if no_indirect_drawing {
                     None
                 } else if item_is_indexed {
-                    Some(indirect_parameters_buffers.allocate_indexed(1))
+                    Some(phase_indirect_parameters_buffers.allocate_indexed(1))
                 } else {
-                    Some(indirect_parameters_buffers.allocate_non_indexed(1))
+                    Some(phase_indirect_parameters_buffers.allocate_non_indexed(1))
                 };
 
                 // Start a new batch.
@@ -1275,7 +1428,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
                         item_is_indexed,
                         output_index,
                         None,
-                        &mut indirect_parameters_buffers,
+                        &mut phase_indirect_parameters_buffers,
                         indirect_parameters_index,
                     );
                 };
@@ -1317,7 +1470,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
             batch.flush(
                 data_buffer.len() as u32,
                 phase,
-                &mut indirect_parameters_buffers,
+                &mut phase_indirect_parameters_buffers,
             );
         }
     }
@@ -1325,8 +1478,8 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
 
 /// Creates batches for a render phase that uses bins.
 pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
-    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
-    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
+    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
+    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
     mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
     mut views: Query<
         (
@@ -1343,13 +1496,12 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 {
     let system_param_item = param.into_inner();
 
-    let BatchedInstanceBuffers {
+    let UntypedPhaseBatchedInstanceBuffers {
         ref mut data_buffer,
         ref mut work_item_buffers,
         ref mut late_indexed_indirect_parameters_buffer,
         ref mut late_non_indexed_indirect_parameters_buffer,
-        ..
-    } = gpu_array_buffer.into_inner();
+    } = phase_batched_instance_buffers.buffers;
 
     for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
         let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else {
@@ -1376,8 +1528,10 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 
         for batch_set_key in &phase.multidrawable_mesh_keys {
             let mut batch_set = None;
-            let indirect_parameters_base =
-                indirect_parameters_buffers.batch_count(batch_set_key.indexed()) as u32;
+            let indirect_parameters_base = phase_indirect_parameters_buffers
+                .buffers
+                .batch_count(batch_set_key.indexed())
+                as u32;
             for (bin_key, bin) in &phase.multidrawable_mesh_values[batch_set_key] {
                 let first_output_index = data_buffer.len() as u32;
                 let mut batch: Option<BinnedRenderPhaseBatch> = None;
@@ -1413,9 +1567,11 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 
                         None => {
                             // Start a new batch, in indirect mode.
-                            let indirect_parameters_index =
-                                indirect_parameters_buffers.allocate(batch_set_key.indexed(), 1);
-                            let batch_set_index = indirect_parameters_buffers
+                            let indirect_parameters_index = phase_indirect_parameters_buffers
+                                .buffers
+                                .allocate(batch_set_key.indexed(), 1);
+                            let batch_set_index = phase_indirect_parameters_buffers
+                                .buffers
                                 .get_next_batch_set_index(batch_set_key.indexed());
 
                             GFBD::write_batch_indirect_parameters_metadata(
@@ -1423,7 +1579,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                                 batch_set_key.indexed(),
                                 output_index,
                                 batch_set_index,
-                                &mut indirect_parameters_buffers,
+                                &mut phase_indirect_parameters_buffers.buffers,
                                 indirect_parameters_index,
                             );
                             work_item_buffer.push(
@@ -1452,7 +1608,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                                 first_batch: batch,
                                 batch_count: 1,
                                 bin_key: bin_key.clone(),
-                                index: indirect_parameters_buffers
+                                index: phase_indirect_parameters_buffers
+                                    .buffers
                                     .batch_set_count(batch_set_key.indexed())
                                     as u32,
                             });
@@ -1469,7 +1626,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
             {
                 if let Some(batch_set) = batch_set {
                     batch_sets.push(batch_set);
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
+                        .buffers
                         .add_batch_set(batch_set_key.indexed(), indirect_parameters_base);
                 }
             }
@@ -1522,17 +1680,19 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 
                     None if !no_indirect_drawing => {
                         // Start a new batch, in indirect mode.
-                        let indirect_parameters_index =
-                            indirect_parameters_buffers.allocate(key.0.indexed(), 1);
-                        let batch_set_index =
-                            indirect_parameters_buffers.get_next_batch_set_index(key.0.indexed());
+                        let indirect_parameters_index = phase_indirect_parameters_buffers
+                            .buffers
+                            .allocate(key.0.indexed(), 1);
+                        let batch_set_index = phase_indirect_parameters_buffers
+                            .buffers
+                            .get_next_batch_set_index(key.0.indexed());
 
                         GFBD::write_batch_indirect_parameters_metadata(
                             input_index.into(),
                             key.0.indexed(),
                             output_index,
                             batch_set_index,
-                            &mut indirect_parameters_buffers,
+                            &mut phase_indirect_parameters_buffers.buffers,
                             indirect_parameters_index,
                         );
                         work_item_buffer.push(
@@ -1589,7 +1749,9 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                             first_batch: batch,
                             batch_count: 1,
                             bin_key: key.1.clone(),
-                            index: indirect_parameters_buffers.batch_set_count(key.0.indexed())
+                            index: phase_indirect_parameters_buffers
+                                .buffers
+                                .batch_set_count(key.0.indexed())
                                 as u32,
                         });
                     }
@@ -1606,12 +1768,14 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                 None
             } else if key.0.indexed() {
                 Some(
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
+                        .buffers
                         .allocate_indexed(unbatchables.entities.len() as u32),
                 )
             } else {
                 Some(
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
+                        .buffers
                         .allocate_non_indexed(unbatchables.entities.len() as u32),
                 )
             };
@@ -1631,7 +1795,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                         key.0.indexed(),
                         output_index,
                         None,
-                        &mut indirect_parameters_buffers,
+                        &mut phase_indirect_parameters_buffers.buffers,
                         *indirect_parameters_index,
                     );
                     work_item_buffer.push(
@@ -1651,7 +1815,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                                 batch_set_index: None,
                             },
                         });
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
+                        .buffers
                         .add_batch_set(key.0.indexed(), *indirect_parameters_index);
                     *indirect_parameters_index += 1;
                 } else {
@@ -1675,6 +1840,64 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
     }
 }
 
+/// A system that gathers up the per-phase GPU buffers and inserts them into the
+/// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
+///
+/// This runs after the [`batch_and_prepare_binned_render_phase`] or
+/// [`batch_and_prepare_sorted_render_phase`] systems. It takes the per-phase
+/// [`PhaseBatchedInstanceBuffers`] and [`PhaseIndirectParametersBuffers`]
+/// resources and inserts them into the global [`BatchedInstanceBuffers`] and
+/// [`IndirectParametersBuffers`] tables.
+///
+/// This system exists so that the [`batch_and_prepare_binned_render_phase`] and
+/// [`batch_and_prepare_sorted_render_phase`] systems for different phases can
+/// run in parallel with one another. If those systems manipulated
+/// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] directly, then
+/// they wouldn't be able to run in parallel.
+pub fn collect_buffers_for_phase<PI, GFBD>(
+    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<PI, GFBD::BufferData>>,
+    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<PI>>,
+    mut batched_instance_buffers: ResMut<
+        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+    >,
+    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
+) where
+    PI: PhaseItem,
+    GFBD: GetFullBatchData + Send + Sync + 'static,
+{
+    // Insert the `PhaseBatchedInstanceBuffers` into the global table. Replace
+    // the contents of the per-phase resource with the old batched instance
+    // buffers in order to reuse allocations.
+    let untyped_phase_batched_instance_buffers =
+        mem::take(&mut phase_batched_instance_buffers.buffers);
+    if let Some(mut old_untyped_phase_batched_instance_buffers) = batched_instance_buffers
+        .phase_instance_buffers
+        .insert(TypeId::of::<PI>(), untyped_phase_batched_instance_buffers)
+    {
+        old_untyped_phase_batched_instance_buffers.clear();
+        phase_batched_instance_buffers.buffers = old_untyped_phase_batched_instance_buffers;
+    }
+
+    // Insert the `PhaseIndirectParametersBuffers` into the global table.
+    // Replace the contents of the per-phase resource with the old indirect
+    // parameters buffers in order to reuse allocations.
+    let untyped_phase_indirect_parameters_buffers = mem::replace(
+        &mut phase_indirect_parameters_buffers.buffers,
+        UntypedPhaseIndirectParametersBuffers::new(
+            indirect_parameters_buffers.allow_copies_from_indirect_parameter_buffers,
+        ),
+    );
+    if let Some(mut old_untyped_phase_indirect_parameters_buffers) =
+        indirect_parameters_buffers.buffers.insert(
+            TypeId::of::<PI>(),
+            untyped_phase_indirect_parameters_buffers,
+        )
+    {
+        old_untyped_phase_indirect_parameters_buffers.clear();
+        phase_indirect_parameters_buffers.buffers = old_untyped_phase_indirect_parameters_buffers;
+    }
+}
+
 /// A system that writes all instance buffers to the GPU.
 pub fn write_batched_instance_buffers<GFBD>(
     render_device: Res<RenderDevice>,
@@ -1684,26 +1907,31 @@ pub fn write_batched_instance_buffers<GFBD>(
     GFBD: GetFullBatchData,
 {
     let BatchedInstanceBuffers {
-        ref mut data_buffer,
-        ref mut work_item_buffers,
         ref mut current_input_buffer,
         ref mut previous_input_buffer,
-        ref mut late_indexed_indirect_parameters_buffer,
-        ref mut late_non_indexed_indirect_parameters_buffer,
+        ref mut phase_instance_buffers,
     } = gpu_array_buffer.into_inner();
 
-    data_buffer.write_buffer(&render_device);
     current_input_buffer
         .buffer
         .write_buffer(&render_device, &render_queue);
     previous_input_buffer
         .buffer
         .write_buffer(&render_device, &render_queue);
-    late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
-    late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
 
-    for view_work_item_buffers in work_item_buffers.values_mut() {
-        for phase_work_item_buffers in view_work_item_buffers.values_mut() {
+    for phase_instance_buffers in phase_instance_buffers.values_mut() {
+        let UntypedPhaseBatchedInstanceBuffers {
+            ref mut data_buffer,
+            ref mut work_item_buffers,
+            ref mut late_indexed_indirect_parameters_buffer,
+            ref mut late_non_indexed_indirect_parameters_buffer,
+        } = *phase_instance_buffers;
+
+        data_buffer.write_buffer(&render_device);
+        late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
+        late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
+
+        for phase_work_item_buffers in work_item_buffers.values_mut() {
             match *phase_work_item_buffers {
                 PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => {
                     buffer_vec.write_buffer(&render_device, &render_queue);
@@ -1739,12 +1967,9 @@ pub fn write_batched_instance_buffers<GFBD>(
 pub fn clear_indirect_parameters_buffers(
     mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) {
-    indirect_parameters_buffers.indexed_data.clear();
-    indirect_parameters_buffers.indexed_metadata.clear();
-    indirect_parameters_buffers.indexed_batch_sets.clear();
-    indirect_parameters_buffers.non_indexed_data.clear();
-    indirect_parameters_buffers.non_indexed_metadata.clear();
-    indirect_parameters_buffers.non_indexed_batch_sets.clear();
+    for phase_indirect_parameters_buffers in indirect_parameters_buffers.buffers.values_mut() {
+        phase_indirect_parameters_buffers.clear();
+    }
 }
 
 pub fn write_indirect_parameters_buffers(
@@ -1752,26 +1977,28 @@ pub fn write_indirect_parameters_buffers(
     render_queue: Res<RenderQueue>,
     mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) {
-    indirect_parameters_buffers
-        .indexed_data
-        .write_buffer(&render_device);
-    indirect_parameters_buffers
-        .non_indexed_data
-        .write_buffer(&render_device);
-
-    indirect_parameters_buffers
-        .indexed_metadata
-        .write_buffer(&render_device, &render_queue);
-    indirect_parameters_buffers
-        .non_indexed_metadata
-        .write_buffer(&render_device, &render_queue);
-
-    indirect_parameters_buffers
-        .indexed_batch_sets
-        .write_buffer(&render_device, &render_queue);
-    indirect_parameters_buffers
-        .non_indexed_batch_sets
-        .write_buffer(&render_device, &render_queue);
+    for phase_indirect_parameters_buffers in indirect_parameters_buffers.buffers.values_mut() {
+        phase_indirect_parameters_buffers
+            .indexed_data
+            .write_buffer(&render_device);
+        phase_indirect_parameters_buffers
+            .non_indexed_data
+            .write_buffer(&render_device);
+
+        phase_indirect_parameters_buffers
+            .indexed_metadata
+            .write_buffer(&render_device, &render_queue);
+        phase_indirect_parameters_buffers
+            .non_indexed_metadata
+            .write_buffer(&render_device, &render_queue);
+
+        phase_indirect_parameters_buffers
+            .indexed_batch_sets
+            .write_buffer(&render_device, &render_queue);
+        phase_indirect_parameters_buffers
+            .non_indexed_batch_sets
+            .write_buffer(&render_device, &render_queue);
+    }
 }
 
 #[cfg(test)]
diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs
index 9569f2ce8c05f..5a548abe7a2f7 100644
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@@ -4,9 +4,9 @@ use bevy_ecs::{
     system::{ResMut, SystemParam, SystemParamItem},
 };
 use bytemuck::Pod;
+use gpu_preprocessing::UntypedPhaseIndirectParametersBuffers;
 use nonmax::NonMaxU32;
 
-use self::gpu_preprocessing::IndirectParametersBuffers;
 use crate::{render_phase::PhaseItemExtraIndex, sync_world::MainEntity};
 use crate::{
     render_phase::{
@@ -171,7 +171,7 @@ pub trait GetFullBatchData: GetBatchData {
         indexed: bool,
         base_output_index: u32,
         batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffers: &mut IndirectParametersBuffers,
+        indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
         indirect_parameters_offset: u32,
     );
 }
diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs
index 050da078dc540..0661d1ea2b884 100644
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@@ -156,6 +156,9 @@ pub enum RenderSet {
     Prepare,
     /// A sub-set within [`Prepare`](RenderSet::Prepare) for initializing buffers, textures and uniforms for use in bind groups.
     PrepareResources,
+    /// A sub-set within [`Prepare`](RenderSet::Prepare) that gathers per-phase buffers into one
+    /// resource after [`PrepareResources`](RenderSet::PrepareResources) has run.
+    PrepareResourcesCollectPhaseBuffers,
     /// Flush buffers after [`PrepareResources`](RenderSet::PrepareResources), but before [`PrepareBindGroups`](RenderSet::PrepareBindGroups).
     PrepareResourcesFlush,
     /// A sub-set within [`Prepare`](RenderSet::Prepare) for constructing bind groups, or other data that relies on render resources prepared in [`PrepareResources`](RenderSet::PrepareResources).
@@ -206,7 +209,12 @@ impl Render {
                 .after(prepare_assets::<RenderMesh>),
         );
         schedule.configure_sets(
-            (PrepareResources, PrepareResourcesFlush, PrepareBindGroups)
+            (
+                PrepareResources,
+                PrepareResourcesCollectPhaseBuffers,
+                PrepareResourcesFlush,
+                PrepareBindGroups,
+            )
                 .chain()
                 .in_set(Prepare),
         );
diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs
index e9baccf141dbe..0ef97eb5e5919 100644
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@@ -43,7 +43,10 @@ use nonmax::NonMaxU32;
 pub use rangefinder::*;
 use wgpu::Features;
 
-use crate::batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport};
+use crate::batching::gpu_preprocessing::{
+    GpuPreprocessingMode, GpuPreprocessingSupport, PhaseBatchedInstanceBuffers,
+    PhaseIndirectParametersBuffers,
+};
 use crate::renderer::RenderDevice;
 use crate::sync_world::{MainEntity, MainEntityHashMap};
 use crate::view::RetainedViewEntity;
@@ -1026,18 +1029,30 @@ impl UnbatchableBinnedEntityIndexSet {
 ///
 /// This is the version used when the pipeline supports GPU preprocessing: e.g.
 /// 3D PBR meshes.
-pub struct BinnedRenderPhasePlugin<BPI, GFBD>(PhantomData<(BPI, GFBD)>)
+pub struct BinnedRenderPhasePlugin<BPI, GFBD>
 where
     BPI: BinnedPhaseItem,
-    GFBD: GetFullBatchData;
+    GFBD: GetFullBatchData,
+{
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
+    phantom: PhantomData<(BPI, GFBD)>,
+}
 
-impl<BPI, GFBD> Default for BinnedRenderPhasePlugin<BPI, GFBD>
+impl<BPI, GFBD> BinnedRenderPhasePlugin<BPI, GFBD>
 where
     BPI: BinnedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    fn default() -> Self {
-        Self(PhantomData)
+    pub fn new(allow_copies_from_indirect_parameters: bool) -> Self {
+        Self {
+            allow_copies_from_indirect_parameters,
+            phantom: PhantomData,
+        }
     }
 }
 
@@ -1053,6 +1068,10 @@ where
 
         render_app
             .init_resource::<ViewBinnedRenderPhases<BPI>>()
+            .init_resource::<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>()
+            .insert_resource(PhaseIndirectParametersBuffers::<BPI>::new(
+                self.allow_copies_from_indirect_parameters,
+            ))
             .add_systems(
                 Render,
                 (
@@ -1068,6 +1087,13 @@ where
                             ),
                     )
                         .in_set(RenderSet::PrepareResources),
+                    gpu_preprocessing::collect_buffers_for_phase::<BPI, GFBD>
+                        .run_if(
+                            resource_exists::<
+                                BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                            >,
+                        )
+                        .in_set(RenderSet::PrepareResourcesCollectPhaseBuffers),
                 ),
             );
     }
@@ -1111,18 +1137,30 @@ where
 ///
 /// This is the version used when the pipeline supports GPU preprocessing: e.g.
 /// 3D PBR meshes.
-pub struct SortedRenderPhasePlugin<SPI, GFBD>(PhantomData<(SPI, GFBD)>)
+pub struct SortedRenderPhasePlugin<SPI, GFBD>
 where
     SPI: SortedPhaseItem,
-    GFBD: GetFullBatchData;
+    GFBD: GetFullBatchData,
+{
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
+    phantom: PhantomData<(SPI, GFBD)>,
+}
 
-impl<SPI, GFBD> Default for SortedRenderPhasePlugin<SPI, GFBD>
+impl<SPI, GFBD> SortedRenderPhasePlugin<SPI, GFBD>
 where
     SPI: SortedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    fn default() -> Self {
-        Self(PhantomData)
+    pub fn new(allow_copies_from_indirect_parameters: bool) -> Self {
+        Self {
+            allow_copies_from_indirect_parameters,
+            phantom: PhantomData,
+        }
     }
 }
 
@@ -1138,18 +1176,32 @@ where
 
         render_app
             .init_resource::<ViewSortedRenderPhases<SPI>>()
+            .init_resource::<PhaseBatchedInstanceBuffers<SPI, GFBD::BufferData>>()
+            .insert_resource(PhaseIndirectParametersBuffers::<SPI>::new(
+                self.allow_copies_from_indirect_parameters,
+            ))
             .add_systems(
                 Render,
                 (
-                    no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
-                        .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
-                    gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>.run_if(
-                        resource_exists::<
-                            BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
-                        >,
-                    ),
-                )
-                    .in_set(RenderSet::PrepareResources),
+                    (
+                        no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
+                            .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
+                        gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
+                            .run_if(
+                                resource_exists::<
+                                    BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                                >,
+                            ),
+                    )
+                        .in_set(RenderSet::PrepareResources),
+                    gpu_preprocessing::collect_buffers_for_phase::<SPI, GFBD>
+                        .run_if(
+                            resource_exists::<
+                                BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                            >,
+                        )
+                        .in_set(RenderSet::PrepareResourcesCollectPhaseBuffers),
+                ),
             );
     }
 }
diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs
index 95fa6abd1cdc3..b616df7f32e32 100644
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@@ -469,7 +469,7 @@ impl GetFullBatchData for Mesh2dPipeline {
         indexed: bool,
         base_output_index: u32,
         batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffer: &mut bevy_render::batching::gpu_preprocessing::IndirectParametersBuffers,
+        indirect_parameters_buffer: &mut bevy_render::batching::gpu_preprocessing::UntypedPhaseIndirectParametersBuffers,
         indirect_parameters_offset: u32,
     ) {
         // Note that `IndirectParameters` covers both of these structures, even
diff --git a/examples/3d/occlusion_culling.rs b/examples/3d/occlusion_culling.rs
index 11bdde698a0bd..8442d3801cb6b 100644
--- a/examples/3d/occlusion_culling.rs
+++ b/examples/3d/occlusion_culling.rs
@@ -6,6 +6,7 @@
 //! the effects of occlusion culling can be seen.
 
 use std::{
+    any::TypeId,
     f32::consts::PI,
     fmt::Write as _,
     result::Result,
@@ -15,9 +16,13 @@ use std::{
 use bevy::{
     color::palettes::css::{SILVER, WHITE},
     core_pipeline::{
-        core_3d::graph::{Core3d, Node3d},
+        core_3d::{
+            graph::{Core3d, Node3d},
+            Opaque3d,
+        },
         prepass::DepthPrepass,
     },
+    pbr::PbrPlugin,
     prelude::*,
     render::{
         batching::gpu_preprocessing::{
@@ -185,6 +190,10 @@ fn main() {
                 .set(RenderPlugin {
                     allow_copies_from_indirect_parameters: true,
                     ..default()
+                })
+                .set(PbrPlugin {
+                    allow_copies_from_indirect_parameters: true,
+                    ..default()
                 }),
         )
         .add_plugins(ReadbackIndirectParametersPlugin)
@@ -421,6 +430,15 @@ impl render_graph::Node for ReadbackIndirectParametersNode {
             return Ok(());
         };
 
+        // Get the indirect parameters buffers corresponding to the opaque 3D
+        // phase, since all our meshes are in that phase.
+        let Some(phase_indirect_parameters_buffers) = indirect_parameters_buffers
+            .buffers
+            .get(&TypeId::of::<Opaque3d>())
+        else {
+            return Ok(());
+        };
+
         // Grab both the buffers we're copying from and the staging buffers
         // we're copying to. Remember that we can't map the indirect parameters
         // buffers directly, so we have to copy their contents to a staging
@@ -431,8 +449,8 @@ impl render_graph::Node for ReadbackIndirectParametersNode {
             Some(indirect_parameters_staging_data_buffer),
             Some(indirect_parameters_staging_batch_sets_buffer),
         ) = (
-            indirect_parameters_buffers.indexed_data_buffer(),
-            indirect_parameters_buffers.indexed_batch_sets_buffer(),
+            phase_indirect_parameters_buffers.indexed_data_buffer(),
+            phase_indirect_parameters_buffers.indexed_batch_sets_buffer(),
             indirect_parameters_mapping_buffers.data.as_ref(),
             indirect_parameters_mapping_buffers.batch_sets.as_ref(),
         )
@@ -474,10 +492,17 @@ fn create_indirect_parameters_staging_buffers(
     indirect_parameters_buffers: Res<IndirectParametersBuffers>,
     render_device: Res<RenderDevice>,
 ) {
+    let Some(phase_indirect_parameters_buffers) = indirect_parameters_buffers
+        .buffers
+        .get(&TypeId::of::<Opaque3d>())
+    else {
+        return;
+    };
+
     // Fetch the indirect parameters buffers that we're going to copy from.
     let (Some(indexed_data_buffer), Some(indexed_batch_set_buffer)) = (
-        indirect_parameters_buffers.indexed_data_buffer(),
-        indirect_parameters_buffers.indexed_batch_sets_buffer(),
+        phase_indirect_parameters_buffers.indexed_data_buffer(),
+        phase_indirect_parameters_buffers.indexed_batch_sets_buffer(),
     ) else {
         return;
     };
diff --git a/examples/shader/specialized_mesh_pipeline.rs b/examples/shader/specialized_mesh_pipeline.rs
index 12dbddb64b303..2b15f801a8cb3 100644
--- a/examples/shader/specialized_mesh_pipeline.rs
+++ b/examples/shader/specialized_mesh_pipeline.rs
@@ -16,12 +16,12 @@ use bevy::{
     },
     prelude::*,
     render::{
-        batching::GetFullBatchData,
         batching::{
             gpu_preprocessing::{
-                self, BatchedInstanceBuffers, IndirectParametersBuffers, PreprocessWorkItem,
+                self, PhaseBatchedInstanceBuffers, PhaseIndirectParametersBuffers,
+                PreprocessWorkItem, UntypedPhaseBatchedInstanceBuffers,
             },
-            GetBatchData,
+            GetBatchData, GetFullBatchData,
         },
         experimental::occlusion_culling::OcclusionCulling,
         extract_component::{ExtractComponent, ExtractComponentPlugin},
@@ -291,24 +291,21 @@ fn queue_custom_mesh_pipeline(
         Res<RenderMeshInstances>,
     ),
     param: StaticSystemParam<<MeshPipeline as GetBatchData>::Param>,
-    gpu_array_buffer: ResMut<
-        BatchedInstanceBuffers<
-            <MeshPipeline as GetBatchData>::BufferData,
-            <MeshPipeline as GetFullBatchData>::BufferInputData,
-        >,
+    mut phase_batched_instance_buffers: ResMut<
+        PhaseBatchedInstanceBuffers<Opaque3d, <MeshPipeline as GetBatchData>::BufferData>,
     >,
-    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
+    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<Opaque3d>>,
     mut change_tick: Local<Tick>,
 ) {
     let system_param_item = param.into_inner();
 
-    let BatchedInstanceBuffers {
+    let UntypedPhaseBatchedInstanceBuffers {
         ref mut data_buffer,
         ref mut work_item_buffers,
         ref mut late_indexed_indirect_parameters_buffer,
         ref mut late_non_indexed_indirect_parameters_buffer,
         ..
-    } = gpu_array_buffer.into_inner();
+    } = phase_batched_instance_buffers.buffers;
 
     // Get the id for our custom draw function
     let draw_function_id = opaque_draw_functions
@@ -378,7 +375,8 @@ fn queue_custom_mesh_pipeline(
             // batch set.
             if mesh_batch_set_info.is_none() {
                 mesh_batch_set_info = Some(MeshBatchSetInfo {
-                    indirect_parameters_index: indirect_parameters_buffers
+                    indirect_parameters_index: phase_indirect_parameters_buffers
+                        .buffers
                         .allocate(mesh.indexed(), 1),
                     is_indexed: mesh.indexed(),
                 });
@@ -449,7 +447,8 @@ fn queue_custom_mesh_pipeline(
         // indirect parameters buffer, so that the renderer will end up
         // enqueuing a command to draw the mesh.
         if let Some(mesh_info) = mesh_batch_set_info {
-            indirect_parameters_buffers
+            phase_indirect_parameters_buffers
+                .buffers
                 .add_batch_set(mesh_info.is_indexed, mesh_info.indirect_parameters_index);
         }
     }

From 1b10fbdf62812e4ebdf2270e94647ac549c9dc77 Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Mon, 10 Feb 2025 18:05:29 -0800
Subject: [PATCH 2/4] Switch the ad-hoc boolean to a `RenderDebugFlags`
 bitfield

---
 crates/bevy_pbr/src/decal/forward.rs          |  3 +-
 crates/bevy_pbr/src/lib.rs                    | 18 +++-----
 crates/bevy_pbr/src/material.rs               | 14 ++-----
 crates/bevy_pbr/src/prepass/mod.rs            | 21 ++++------
 crates/bevy_pbr/src/render/mesh.rs            | 41 ++++++-------------
 crates/bevy_render/Cargo.toml                 |  1 +
 .../src/batching/gpu_preprocessing.rs         | 13 +++---
 crates/bevy_render/src/lib.rs                 | 24 +++++++----
 crates/bevy_render/src/render_phase/mod.rs    | 31 ++++++--------
 examples/3d/occlusion_culling.rs              |  7 +++-
 examples/shader/custom_render_phase.rs        |  3 +-
 11 files changed, 75 insertions(+), 101 deletions(-)

diff --git a/crates/bevy_pbr/src/decal/forward.rs b/crates/bevy_pbr/src/decal/forward.rs
index 4771ff1a5dac6..7732f1d3a4ab3 100644
--- a/crates/bevy_pbr/src/decal/forward.rs
+++ b/crates/bevy_pbr/src/decal/forward.rs
@@ -14,6 +14,7 @@ use bevy_render::{
         AsBindGroup, CompareFunction, RenderPipelineDescriptor, Shader,
         SpecializedMeshPipelineError,
     },
+    RenderDebugFlags,
 };
 
 const FORWARD_DECAL_MESH_HANDLE: Handle<Mesh> =
@@ -48,7 +49,7 @@ impl Plugin for ForwardDecalPlugin {
         app.add_plugins(MaterialPlugin::<ForwardDecalMaterial<StandardMaterial>> {
             prepass_enabled: false,
             shadows_enabled: false,
-            allow_copies_from_indirect_parameters: false,
+            debug_flags: RenderDebugFlags::default(),
             ..Default::default()
         });
     }
diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs
index 787704c10282a..10997ab43eea2 100644
--- a/crates/bevy_pbr/src/lib.rs
+++ b/crates/bevy_pbr/src/lib.rs
@@ -125,7 +125,7 @@ use bevy_render::{
     sync_component::SyncComponentPlugin,
     texture::GpuImage,
     view::VisibilitySystems,
-    ExtractSchedule, Render, RenderApp, RenderSet,
+    ExtractSchedule, Render, RenderApp, RenderDebugFlags, RenderSet,
 };
 
 use bevy_transform::TransformSystem;
@@ -182,12 +182,8 @@ pub struct PbrPlugin {
     /// This requires compute shader support and so will be forcibly disabled if
     /// the platform doesn't support those.
     pub use_gpu_instance_buffer_builder: bool,
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
 }
 
 impl Default for PbrPlugin {
@@ -196,7 +192,7 @@ impl Default for PbrPlugin {
             prepass_enabled: true,
             add_default_deferred_lighting_plugin: true,
             use_gpu_instance_buffer_builder: true,
-            allow_copies_from_indirect_parameters: false,
+            debug_flags: RenderDebugFlags::default(),
         }
     }
 }
@@ -340,13 +336,11 @@ impl Plugin for PbrPlugin {
             .add_plugins((
                 MeshRenderPlugin {
                     use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder,
-                    allow_copies_from_indirect_parameters: self
-                        .allow_copies_from_indirect_parameters,
+                    debug_flags: self.debug_flags,
                 },
                 MaterialPlugin::<StandardMaterial> {
                     prepass_enabled: self.prepass_enabled,
-                    allow_copies_from_indirect_parameters: self
-                        .allow_copies_from_indirect_parameters,
+                    debug_flags: self.debug_flags,
                     ..Default::default()
                 },
                 ScreenSpaceAmbientOcclusionPlugin,
diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs
index dc92e9f7a54a8..503e498577d0d 100644
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@@ -252,12 +252,8 @@ pub struct MaterialPlugin<M: Material> {
     pub prepass_enabled: bool,
     /// Controls if shadows are enabled for the Material.
     pub shadows_enabled: bool,
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
     pub _marker: PhantomData<M>,
 }
 
@@ -266,7 +262,7 @@ impl<M: Material> Default for MaterialPlugin<M> {
         Self {
             prepass_enabled: true,
             shadows_enabled: true,
-            allow_copies_from_indirect_parameters: false,
+            debug_flags: RenderDebugFlags::default(),
             _marker: Default::default(),
         }
     }
@@ -381,9 +377,7 @@ where
         }
 
         if self.prepass_enabled {
-            app.add_plugins(PrepassPlugin::<M>::new(
-                self.allow_copies_from_indirect_parameters,
-            ));
+            app.add_plugins(PrepassPlugin::<M>::new(self.debug_flags));
         }
     }
 
diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs
index d867b2f8491ae..0ec1953833450 100644
--- a/crates/bevy_pbr/src/prepass/mod.rs
+++ b/crates/bevy_pbr/src/prepass/mod.rs
@@ -19,7 +19,7 @@ use bevy_render::{
     renderer::RenderAdapter,
     sync_world::RenderEntity,
     view::{RenderVisibilityRanges, VISIBILITY_RANGES_STORAGE_BUFFER_COUNT},
-    ExtractSchedule, Render, RenderApp, RenderSet,
+    ExtractSchedule, Render, RenderApp, RenderDebugFlags, RenderSet,
 };
 pub use prepass_bindings::*;
 
@@ -147,19 +147,16 @@ where
 ///
 /// This depends on the [`PrepassPipelinePlugin`].
 pub struct PrepassPlugin<M: Material> {
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
     pub phantom: PhantomData<M>,
 }
 
 impl<M: Material> PrepassPlugin<M> {
-    pub fn new(allow_copies_from_indirect_parameters: bool) -> Self {
+    /// Creates a new [`PrepassPlugin`] with the given debug flags.
+    pub fn new(debug_flags: RenderDebugFlags) -> Self {
         PrepassPlugin {
-            allow_copies_from_indirect_parameters,
+            debug_flags,
             phantom: PhantomData,
         }
     }
@@ -187,11 +184,9 @@ where
                     ),
                 )
                 .add_plugins((
-                    BinnedRenderPhasePlugin::<Opaque3dPrepass, MeshPipeline>::new(
-                        self.allow_copies_from_indirect_parameters,
-                    ),
+                    BinnedRenderPhasePlugin::<Opaque3dPrepass, MeshPipeline>::new(self.debug_flags),
                     BinnedRenderPhasePlugin::<AlphaMask3dPrepass, MeshPipeline>::new(
-                        self.allow_copies_from_indirect_parameters,
+                        self.debug_flags,
                     ),
                 ));
         }
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index a603bd34db67a..ace104250dbc7 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -86,19 +86,16 @@ pub struct MeshRenderPlugin {
     /// This requires compute shader support and so will be forcibly disabled if
     /// the platform doesn't support those.
     pub use_gpu_instance_buffer_builder: bool,
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
 }
 
 impl MeshRenderPlugin {
-    pub fn new(allow_copies_from_indirect_parameters: bool) -> MeshRenderPlugin {
+    /// Creates a new [`MeshRenderPlugin`] with the given debug flags.
+    pub fn new(debug_flags: RenderDebugFlags) -> MeshRenderPlugin {
         MeshRenderPlugin {
             use_gpu_instance_buffer_builder: false,
-            allow_copies_from_indirect_parameters,
+            debug_flags,
         }
     }
 }
@@ -181,27 +178,13 @@ impl Plugin for MeshRenderPlugin {
             (no_automatic_skin_batching, no_automatic_morph_batching),
         )
         .add_plugins((
-            BinnedRenderPhasePlugin::<Opaque3d, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
-            BinnedRenderPhasePlugin::<AlphaMask3d, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
-            BinnedRenderPhasePlugin::<Shadow, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
-            BinnedRenderPhasePlugin::<Opaque3dDeferred, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
-            BinnedRenderPhasePlugin::<AlphaMask3dDeferred, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
-            SortedRenderPhasePlugin::<Transmissive3d, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
-            SortedRenderPhasePlugin::<Transparent3d, MeshPipeline>::new(
-                self.allow_copies_from_indirect_parameters,
-            ),
+            BinnedRenderPhasePlugin::<Opaque3d, MeshPipeline>::new(self.debug_flags),
+            BinnedRenderPhasePlugin::<AlphaMask3d, MeshPipeline>::new(self.debug_flags),
+            BinnedRenderPhasePlugin::<Shadow, MeshPipeline>::new(self.debug_flags),
+            BinnedRenderPhasePlugin::<Opaque3dDeferred, MeshPipeline>::new(self.debug_flags),
+            BinnedRenderPhasePlugin::<AlphaMask3dDeferred, MeshPipeline>::new(self.debug_flags),
+            SortedRenderPhasePlugin::<Transmissive3d, MeshPipeline>::new(self.debug_flags),
+            SortedRenderPhasePlugin::<Transparent3d, MeshPipeline>::new(self.debug_flags),
         ));
 
         if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
diff --git a/crates/bevy_render/Cargo.toml b/crates/bevy_render/Cargo.toml
index 1f349202b6005..38424dbe94e8d 100644
--- a/crates/bevy_render/Cargo.toml
+++ b/crates/bevy_render/Cargo.toml
@@ -102,6 +102,7 @@ variadics_please = "1.1"
 tracing = { version = "0.1", default-features = false, features = ["std"] }
 indexmap = { version = "2" }
 fixedbitset = { version = "0.5" }
+bitflags = "2"
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 # Omit the `glsl` feature in non-WebAssembly by default.
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index 2e8cf0208f1e8..6fed854dd2788 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -31,19 +31,15 @@ use crate::{
     render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
     renderer::{RenderAdapter, RenderDevice, RenderQueue},
     view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
-    Render, RenderApp, RenderSet,
+    Render, RenderApp, RenderDebugFlags, RenderSet,
 };
 
 use super::{BatchMeta, GetBatchData, GetFullBatchData};
 
 #[derive(Default)]
 pub struct BatchingPlugin {
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
 }
 
 impl Plugin for BatchingPlugin {
@@ -54,7 +50,8 @@ impl Plugin for BatchingPlugin {
 
         render_app
             .insert_resource(IndirectParametersBuffers::new(
-                self.allow_copies_from_indirect_parameters,
+                self.debug_flags
+                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
             ))
             .add_systems(
                 Render,
diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs
index 2bc14949aaf6d..76e1f1f0b7619 100644
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@@ -102,6 +102,7 @@ use alloc::sync::Arc;
 use bevy_app::{App, AppLabel, Plugin, SubApp};
 use bevy_asset::{load_internal_asset, weak_handle, AssetApp, AssetServer, Handle};
 use bevy_ecs::{prelude::*, schedule::ScheduleLabel};
+use bitflags::bitflags;
 use core::ops::{Deref, DerefMut};
 use std::sync::Mutex;
 use tracing::debug;
@@ -120,12 +121,21 @@ pub struct RenderPlugin {
     /// If `true`, disables asynchronous pipeline compilation.
     /// This has no effect on macOS, Wasm, iOS, or without the `multi_threaded` feature.
     pub synchronous_pipeline_compilation: bool,
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
+}
+
+bitflags! {
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    #[derive(Clone, Copy, PartialEq, Default, Debug)]
+    pub struct RenderDebugFlags: u8 {
+        /// If set, this enables the `COPY_SRC` flag on indirect draw parameters
+        /// so that they can be read back to the CPU.
+        ///
+        /// This is a debugging feature that may reduce performance. It
+        /// primarily exists for the `occlusion_culling` example.
+        const ALLOW_COPIES_FROM_INDIRECT_PARAMETERS = 1;
+    }
 }
 
 /// The systems sets of the default [`App`] rendering schedule.
@@ -388,7 +398,7 @@ impl Plugin for RenderPlugin {
             GlobalsPlugin,
             MorphPlugin,
             BatchingPlugin {
-                allow_copies_from_indirect_parameters: self.allow_copies_from_indirect_parameters,
+                debug_flags: self.debug_flags,
             },
             SyncWorldPlugin,
             StoragePlugin,
diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs
index 4c4bbb31b3af2..ddede356802e9 100644
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@@ -50,6 +50,7 @@ use crate::batching::gpu_preprocessing::{
 use crate::renderer::RenderDevice;
 use crate::sync_world::{MainEntity, MainEntityHashMap};
 use crate::view::RetainedViewEntity;
+use crate::RenderDebugFlags;
 use crate::{
     batching::{
         self,
@@ -1032,12 +1033,8 @@ where
     BPI: BinnedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
     phantom: PhantomData<(BPI, GFBD)>,
 }
 
@@ -1046,9 +1043,9 @@ where
     BPI: BinnedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    pub fn new(allow_copies_from_indirect_parameters: bool) -> Self {
+    pub fn new(debug_flags: RenderDebugFlags) -> Self {
         Self {
-            allow_copies_from_indirect_parameters,
+            debug_flags,
             phantom: PhantomData,
         }
     }
@@ -1068,7 +1065,8 @@ where
             .init_resource::<ViewBinnedRenderPhases<BPI>>()
             .init_resource::<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>()
             .insert_resource(PhaseIndirectParametersBuffers::<BPI>::new(
-                self.allow_copies_from_indirect_parameters,
+                self.debug_flags
+                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
             ))
             .add_systems(
                 Render,
@@ -1141,12 +1139,8 @@ where
     SPI: SortedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
-    /// that they can be read back to CPU.
-    ///
-    /// This is a debugging feature that may reduce performance. It primarily
-    /// exists for the `occlusion_culling` example.
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
+    pub debug_flags: RenderDebugFlags,
     phantom: PhantomData<(SPI, GFBD)>,
 }
 
@@ -1155,9 +1149,9 @@ where
     SPI: SortedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    pub fn new(allow_copies_from_indirect_parameters: bool) -> Self {
+    pub fn new(debug_flags: RenderDebugFlags) -> Self {
         Self {
-            allow_copies_from_indirect_parameters,
+            debug_flags,
             phantom: PhantomData,
         }
     }
@@ -1177,7 +1171,8 @@ where
             .init_resource::<ViewSortedRenderPhases<SPI>>()
             .init_resource::<PhaseBatchedInstanceBuffers<SPI, GFBD::BufferData>>()
             .insert_resource(PhaseIndirectParametersBuffers::<SPI>::new(
-                self.allow_copies_from_indirect_parameters,
+                self.debug_flags
+                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
             ))
             .add_systems(
                 Render,
diff --git a/examples/3d/occlusion_culling.rs b/examples/3d/occlusion_culling.rs
index 8442d3801cb6b..76ef472f92286 100644
--- a/examples/3d/occlusion_culling.rs
+++ b/examples/3d/occlusion_culling.rs
@@ -37,6 +37,7 @@ use bevy::{
         Render, RenderApp, RenderPlugin, RenderSet,
     },
 };
+use bevy_render::RenderDebugFlags;
 use bytemuck::Pod;
 
 /// The radius of the spinning sphere of cubes.
@@ -177,6 +178,8 @@ impl Default for AppStatus {
 }
 
 fn main() {
+    let render_debug_flags = RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS;
+
     App::new()
         .add_plugins(
             DefaultPlugins
@@ -188,11 +191,11 @@ fn main() {
                     ..default()
                 })
                 .set(RenderPlugin {
-                    allow_copies_from_indirect_parameters: true,
+                    debug_flags: render_debug_flags,
                     ..default()
                 })
                 .set(PbrPlugin {
-                    allow_copies_from_indirect_parameters: true,
+                    debug_flags: render_debug_flags,
                     ..default()
                 }),
         )
diff --git a/examples/shader/custom_render_phase.rs b/examples/shader/custom_render_phase.rs
index 399a4c758d895..a2c1a30f5c74f 100644
--- a/examples/shader/custom_render_phase.rs
+++ b/examples/shader/custom_render_phase.rs
@@ -56,6 +56,7 @@ use bevy::{
         Extract, Render, RenderApp, RenderSet,
     },
 };
+use bevy_render::batching::gpu_preprocessing::UntypedPhaseIndirectParametersBuffers;
 use nonmax::NonMaxU32;
 
 const SHADER_ASSET_PATH: &str = "shaders/custom_stencil.wgsl";
@@ -435,7 +436,7 @@ impl GetFullBatchData for StencilPipeline {
         indexed: bool,
         base_output_index: u32,
         batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffers: &mut bevy_render::batching::gpu_preprocessing::IndirectParametersBuffers,
+        indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
         indirect_parameters_offset: u32,
     ) {
         // Note that `IndirectParameters` covers both of these structures, even

From 8cb3cdf21effd5c8c95d5e74ac431f5f078fa8ca Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Mon, 10 Feb 2025 18:08:21 -0800
Subject: [PATCH 3/4] Internal import police

---
 examples/3d/occlusion_culling.rs       | 3 +--
 examples/shader/custom_render_phase.rs | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/3d/occlusion_culling.rs b/examples/3d/occlusion_culling.rs
index 76ef472f92286..f799d89e4ca1a 100644
--- a/examples/3d/occlusion_culling.rs
+++ b/examples/3d/occlusion_culling.rs
@@ -34,10 +34,9 @@ use bevy::{
         render_resource::{Buffer, BufferDescriptor, BufferUsages, MapMode},
         renderer::{RenderAdapter, RenderContext, RenderDevice},
         settings::WgpuFeatures,
-        Render, RenderApp, RenderPlugin, RenderSet,
+        Render, RenderApp, RenderDebugFlags, RenderPlugin, RenderSet,
     },
 };
-use bevy_render::RenderDebugFlags;
 use bytemuck::Pod;
 
 /// The radius of the spinning sphere of cubes.
diff --git a/examples/shader/custom_render_phase.rs b/examples/shader/custom_render_phase.rs
index a2c1a30f5c74f..31a9ddb37db43 100644
--- a/examples/shader/custom_render_phase.rs
+++ b/examples/shader/custom_render_phase.rs
@@ -29,6 +29,7 @@ use bevy::{
         batching::{
             gpu_preprocessing::{
                 batch_and_prepare_sorted_render_phase, IndirectParametersMetadata,
+                UntypedPhaseIndirectParametersBuffers,
             },
             GetBatchData, GetFullBatchData,
         },
@@ -56,7 +57,6 @@ use bevy::{
         Extract, Render, RenderApp, RenderSet,
     },
 };
-use bevy_render::batching::gpu_preprocessing::UntypedPhaseIndirectParametersBuffers;
 use nonmax::NonMaxU32;
 
 const SHADER_ASSET_PATH: &str = "shaders/custom_stencil.wgsl";

From bb7299cfd88da2f144bc923b755fe010fdbca60e Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Tue, 11 Feb 2025 22:19:56 -0800
Subject: [PATCH 4/4] Use `Deref` for `IndirectParametersBuffers`

---
 crates/bevy_pbr/src/render/gpu_preprocess.rs         |  6 +++---
 crates/bevy_pbr/src/render/mesh.rs                   |  4 ++--
 crates/bevy_render/src/batching/gpu_preprocessing.rs | 12 +++++++-----
 examples/3d/occlusion_culling.rs                     | 10 ++++------
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs
index 58ef93d906c8b..495b6b4112f00 100644
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@@ -1060,7 +1060,7 @@ fn run_build_indirect_parameters_node(
         build_indirect_params_bind_groups.iter()
     {
         let Some(phase_indirect_parameters_buffers) =
-            indirect_parameters_buffers.buffers.get(phase_type_id)
+            indirect_parameters_buffers.get(phase_type_id)
         else {
             continue;
         };
@@ -1709,7 +1709,7 @@ pub fn prepare_preprocess_bind_groups(
 
             // Grab the indirect parameters buffers for this phase.
             let Some(phase_indirect_parameters_buffers) =
-                indirect_parameters_buffers.buffers.get(phase_type_id)
+                indirect_parameters_buffers.get(phase_type_id)
             else {
                 continue;
             };
@@ -2403,7 +2403,7 @@ fn create_build_indirect_parameters_bind_groups(
 ) {
     let mut build_indirect_parameters_bind_groups = BuildIndirectParametersBindGroups::new();
 
-    for (phase_type_id, phase_indirect_parameters_buffer) in &indirect_parameters_buffers.buffers {
+    for (phase_type_id, phase_indirect_parameters_buffer) in indirect_parameters_buffers.iter() {
         build_indirect_parameters_bind_groups.insert(
             *phase_type_id,
             PhaseBuildIndirectParametersBindGroups {
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 78b11ea23d179..6690724fb0e6d 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -3077,7 +3077,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                         // the buffer we're going to use for
                         // `multi_draw_indexed_indirect_count` (if available).
                         let Some(phase_indirect_parameters_buffers) =
-                            indirect_parameters_buffer.buffers.get(&TypeId::of::<P>())
+                            indirect_parameters_buffer.get(&TypeId::of::<P>())
                         else {
                             warn!(
                                 "Not rendering mesh because indexed indirect parameters buffer \
@@ -3143,7 +3143,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                     // buffer we're going to use for
                     // `multi_draw_indirect_count` (if available).
                     let Some(phase_indirect_parameters_buffers) =
-                        indirect_parameters_buffer.buffers.get(&TypeId::of::<P>())
+                        indirect_parameters_buffer.get(&TypeId::of::<P>())
                     else {
                         warn!(
                             "Not rendering mesh because indexed indirect parameters buffer \
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index 8586519eb6fb3..1891de79f8fdd 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -3,6 +3,7 @@
 use core::{any::TypeId, marker::PhantomData, mem};
 
 use bevy_app::{App, Plugin};
+use bevy_derive::{Deref, DerefMut};
 use bevy_ecs::{
     prelude::Entity,
     query::{Has, With},
@@ -756,12 +757,13 @@ pub struct IndirectBatchSet {
 /// pass can determine how many meshes are actually to be drawn.
 ///
 /// These buffers will remain empty if indirect drawing isn't in use.
-#[derive(Resource)]
+#[derive(Resource, Deref, DerefMut)]
 pub struct IndirectParametersBuffers {
     /// A mapping from a phase type ID to the indirect parameters buffers for
     /// that phase.
     ///
     /// Examples of phase type IDs are `Opaque3d` and `AlphaMask3d`.
+    #[deref]
     pub buffers: TypeIdMap<UntypedPhaseIndirectParametersBuffers>,
     /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
     /// that they can be read back to CPU.
@@ -1875,8 +1877,8 @@ pub fn collect_buffers_for_phase<PI, GFBD>(
             indirect_parameters_buffers.allow_copies_from_indirect_parameter_buffers,
         ),
     );
-    if let Some(mut old_untyped_phase_indirect_parameters_buffers) =
-        indirect_parameters_buffers.buffers.insert(
+    if let Some(mut old_untyped_phase_indirect_parameters_buffers) = indirect_parameters_buffers
+        .insert(
             TypeId::of::<PI>(),
             untyped_phase_indirect_parameters_buffers,
         )
@@ -1955,7 +1957,7 @@ pub fn write_batched_instance_buffers<GFBD>(
 pub fn clear_indirect_parameters_buffers(
     mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) {
-    for phase_indirect_parameters_buffers in indirect_parameters_buffers.buffers.values_mut() {
+    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
         phase_indirect_parameters_buffers.clear();
     }
 }
@@ -1965,7 +1967,7 @@ pub fn write_indirect_parameters_buffers(
     render_queue: Res<RenderQueue>,
     mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) {
-    for phase_indirect_parameters_buffers in indirect_parameters_buffers.buffers.values_mut() {
+    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
         phase_indirect_parameters_buffers
             .indexed_data
             .write_buffer(&render_device);
diff --git a/examples/3d/occlusion_culling.rs b/examples/3d/occlusion_culling.rs
index f799d89e4ca1a..767875e86108d 100644
--- a/examples/3d/occlusion_culling.rs
+++ b/examples/3d/occlusion_culling.rs
@@ -434,9 +434,8 @@ impl render_graph::Node for ReadbackIndirectParametersNode {
 
         // Get the indirect parameters buffers corresponding to the opaque 3D
         // phase, since all our meshes are in that phase.
-        let Some(phase_indirect_parameters_buffers) = indirect_parameters_buffers
-            .buffers
-            .get(&TypeId::of::<Opaque3d>())
+        let Some(phase_indirect_parameters_buffers) =
+            indirect_parameters_buffers.get(&TypeId::of::<Opaque3d>())
         else {
             return Ok(());
         };
@@ -494,9 +493,8 @@ fn create_indirect_parameters_staging_buffers(
     indirect_parameters_buffers: Res<IndirectParametersBuffers>,
     render_device: Res<RenderDevice>,
 ) {
-    let Some(phase_indirect_parameters_buffers) = indirect_parameters_buffers
-        .buffers
-        .get(&TypeId::of::<Opaque3d>())
+    let Some(phase_indirect_parameters_buffers) =
+        indirect_parameters_buffers.get(&TypeId::of::<Opaque3d>())
     else {
         return;
     };