-
-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
More triangles/vertices per meshlet #15023
Changes from all commits
f565cd5
559535e
0866f4a
852923e
e0180b9
8bc76c6
ffb3d3a
e9f3029
ab4c870
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,10 +22,10 @@ | |
|
||
// TODO: Subpixel precision and top-left rule | ||
|
||
var<workgroup> viewport_vertices: array<vec3f, 64>; | ||
var<workgroup> viewport_vertices: array<vec3f, 255>; | ||
|
||
@compute | ||
@workgroup_size(64, 1, 1) // 64 threads per workgroup, 1 vertex/triangle per thread, 1 cluster per workgroup | ||
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1-2 vertices per thread, 1 triangle per thread, 1 cluster per workgroup | ||
fn rasterize_cluster( | ||
@builtin(workgroup_id) workgroup_id: vec3<u32>, | ||
@builtin(local_invocation_index) local_invocation_index: u32, | ||
|
@@ -44,28 +44,30 @@ fn rasterize_cluster( | |
let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; | ||
let meshlet = meshlets[meshlet_id]; | ||
|
||
// Load and project 1 vertex per thread | ||
let vertex_id = local_invocation_index; | ||
if vertex_id < meshlet.vertex_count { | ||
let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id]; | ||
let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]); | ||
|
||
// Project vertex to viewport space | ||
let instance_id = meshlet_cluster_instance_ids[cluster_id]; | ||
let instance_uniform = meshlet_instance_uniforms[instance_id]; | ||
let world_from_local = affine3_to_square(instance_uniform.world_from_local); | ||
let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0)); | ||
var clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0); | ||
var ndc_position = clip_position.xyz / clip_position.w; | ||
let instance_id = meshlet_cluster_instance_ids[cluster_id]; | ||
let instance_uniform = meshlet_instance_uniforms[instance_id]; | ||
let world_from_local = affine3_to_square(instance_uniform.world_from_local); | ||
|
||
// Load and project 1 vertex per thread, and then again if there are more than 128 vertices in the meshlet | ||
for (var i = 0u; i <= 128u; i += 128u) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: up to you, but it feels like it should be: for (i = 0; i < max_vertices; i += workgroup_size) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, not quite the way I thought of it. My thought process was "we want to do this twice, with the second time using an offset of 128". |
||
let vertex_id = local_invocation_index + i; | ||
if vertex_id < meshlet.vertex_count { | ||
let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id]; | ||
let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]); | ||
|
||
// Project vertex to viewport space | ||
let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0)); | ||
let clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0); | ||
var ndc_position = clip_position.xyz / clip_position.w; | ||
#ifdef DEPTH_CLAMP_ORTHO | ||
ndc_position.z = 1.0 / clip_position.z; | ||
ndc_position.z = 1.0 / clip_position.z; | ||
#endif | ||
let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw; | ||
let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw; | ||
|
||
// Write vertex to workgroup shared memory | ||
viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z); | ||
// Write vertex to workgroup shared memory | ||
viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z); | ||
} | ||
} | ||
|
||
workgroupBarrier(); | ||
|
||
// Load 1 triangle's worth of vertex data per thread | ||
|
@@ -76,7 +78,7 @@ fn rasterize_cluster( | |
let vertex_0 = viewport_vertices[vertex_ids[2]]; | ||
let vertex_1 = viewport_vertices[vertex_ids[1]]; | ||
let vertex_2 = viewport_vertices[vertex_ids[0]]; | ||
let packed_ids = (cluster_id << 6u) | triangle_id; | ||
let packed_ids = (cluster_id << 7u) | triangle_id; | ||
|
||
// Compute triangle bounding box | ||
let min_x = u32(min3(vertex_0.x, vertex_1.x, vertex_2.x)); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
After this change, is there a scenario where both of the following happen:
For Nanite simplification, all vertices mentioned in point 2 could probably be removed. While this might introduce artefacts, the error metric should show more detailed meshlets in such cases.
I suspect that to produce an optimal DAG ("Batched Multi Triangulation" section 3.2) you should always remove as many triangles as possible to reach 256 triangles (2 meshlets). The only constraint is Nanite's immutable shared vertices between meshlets. Any problems caused by simplification are shoved into the error metric.
See #14998 (comment) for context.