-
-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
More triangles/vertices per meshlet #15023
Changes from all commits
f565cd5
559535e
0866f4a
852923e
e0180b9
8bc76c6
ffb3d3a
e9f3029
ab4c870
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,10 +22,10 @@ | |
|
||
// TODO: Subpixel precision and top-left rule | ||
|
||
var<workgroup> viewport_vertices: array<vec3f, 64>; | ||
var<workgroup> viewport_vertices: array<vec3f, 255>; | ||
|
||
@compute | ||
@workgroup_size(64, 1, 1) // 64 threads per workgroup, 1 vertex/triangle per thread, 1 cluster per workgroup | ||
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1-2 vertices per thread, 1 triangle per thread, 1 cluster per workgroup | ||
fn rasterize_cluster( | ||
@builtin(workgroup_id) workgroup_id: vec3<u32>, | ||
@builtin(local_invocation_index) local_invocation_index: u32, | ||
|
@@ -44,28 +44,30 @@ fn rasterize_cluster( | |
let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; | ||
let meshlet = meshlets[meshlet_id]; | ||
|
||
// Load and project 1 vertex per thread | ||
let vertex_id = local_invocation_index; | ||
if vertex_id < meshlet.vertex_count { | ||
let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id]; | ||
let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]); | ||
|
||
// Project vertex to viewport space | ||
let instance_id = meshlet_cluster_instance_ids[cluster_id]; | ||
let instance_uniform = meshlet_instance_uniforms[instance_id]; | ||
let world_from_local = affine3_to_square(instance_uniform.world_from_local); | ||
let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0)); | ||
var clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0); | ||
var ndc_position = clip_position.xyz / clip_position.w; | ||
let instance_id = meshlet_cluster_instance_ids[cluster_id]; | ||
let instance_uniform = meshlet_instance_uniforms[instance_id]; | ||
let world_from_local = affine3_to_square(instance_uniform.world_from_local); | ||
|
||
// Load and project 1 vertex per thread, and then again if there are more than 128 vertices in the meshlet | ||
for (var i = 0u; i <= 128u; i += 128u) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: up to you, but it feels like it should be: for (i = 0; i < max_vertices; i += workgroup_size) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, not quite the way I thought of it. My thought process was "we want to do this twice, with the second time using an offset of 128". |
||
let vertex_id = local_invocation_index + i; | ||
if vertex_id < meshlet.vertex_count { | ||
let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id]; | ||
let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]); | ||
|
||
// Project vertex to viewport space | ||
let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0)); | ||
let clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0); | ||
var ndc_position = clip_position.xyz / clip_position.w; | ||
#ifdef DEPTH_CLAMP_ORTHO | ||
ndc_position.z = 1.0 / clip_position.z; | ||
ndc_position.z = 1.0 / clip_position.z; | ||
#endif | ||
let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw; | ||
let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw; | ||
|
||
// Write vertex to workgroup shared memory | ||
viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z); | ||
// Write vertex to workgroup shared memory | ||
viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z); | ||
} | ||
} | ||
|
||
workgroupBarrier(); | ||
|
||
// Load 1 triangle's worth of vertex data per thread | ||
|
@@ -76,7 +78,7 @@ fn rasterize_cluster( | |
let vertex_0 = viewport_vertices[vertex_ids[2]]; | ||
let vertex_1 = viewport_vertices[vertex_ids[1]]; | ||
let vertex_2 = viewport_vertices[vertex_ids[0]]; | ||
let packed_ids = (cluster_id << 6u) | triangle_id; | ||
let packed_ids = (cluster_id << 7u) | triangle_id; | ||
|
||
// Compute triangle bounding box | ||
let min_x = u32(min3(vertex_0.x, vertex_1.x, vertex_2.x)); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
After this change, is there a scenario where both of the following happen:
For Nanite simplification, all vertices mentioned in point 2 could probably be removed. While this might introduce artefacts, the error metric should show more detailed meshlets in such cases.
I suspect that to produce an optimal DAG ("Batched Multi Triangulation" section 3.2) you should always remove as many triangles as possible to reach 256 triangles (2 meshlets). The only constraint is Nanite's immutable shared vertices between meshlets. Any problems caused by simplification are shoved into the error metric.
See #14998 (comment) for context.