diff --git a/crates/encoding/src/clip.rs b/crates/encoding/src/clip.rs
index 5a4e8f074..025b86bc5 100644
--- a/crates/encoding/src/clip.rs
+++ b/crates/encoding/src/clip.rs
@@ -41,3 +41,14 @@ pub struct Clip {
 pub struct ClipBbox {
     pub bbox: [f32; 4],
 }
+
+impl ClipBic {
+    pub fn new(a: u32, b: u32) -> Self {
+        ClipBic { a, b }
+    }
+
+    pub fn combine(self, other: ClipBic) -> Self {
+        let m = self.b.min(other.a);
+        ClipBic::new(self.a + other.a - m, self.b + other.b - m)
+    }
+}
diff --git a/crates/encoding/src/config.rs b/crates/encoding/src/config.rs
index 93ffe80c6..d853a0808 100644
--- a/crates/encoding/src/config.rs
+++ b/crates/encoding/src/config.rs
@@ -1,9 +1,11 @@
 // Copyright 2023 The Vello authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
+use crate::SegmentCount;
+
 use super::{
-    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, Path,
-    PathBbox, PathMonoid, PathSegment, Tile,
+    BinHeader, Clip, ClipBbox, ClipBic, ClipElement, Cubic, DrawBbox, DrawMonoid, Layout, LineSoup,
+    Path, PathBbox, PathMonoid, PathSegment, Tile,
 };
 use bytemuck::{Pod, Zeroable};
 use std::mem;
@@ -29,8 +31,24 @@ pub struct BumpAllocators {
     pub binning: u32,
     pub ptcl: u32,
     pub tile: u32,
+    pub seg_counts: u32,
     pub segments: u32,
     pub blend: u32,
+    pub lines: u32,
+}
+
+/// Storage of indirect dispatch size values.
+///
+/// The original plan was to reuse BumpAllocators, but the WebGPU
+/// compatible-usage-list rules forbid a buffer that is bound as writable
+/// storage from also being used as an indirect dispatch count.
+#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
+#[repr(C)]
+pub struct IndirectCount {
+    pub count_x: u32,
+    pub count_y: u32,
+    pub count_z: u32,
+    pub pad0: u32,
 }
 
 /// Uniform render configuration data used by all GPU stages.
@@ -114,7 +132,7 @@ pub struct WorkgroupCounts {
     pub path_scan1: WorkgroupSize,
     pub path_scan: WorkgroupSize,
     pub bbox_clear: WorkgroupSize,
-    pub path_seg: WorkgroupSize,
+    pub flatten: WorkgroupSize,
     pub draw_reduce: WorkgroupSize,
     pub draw_leaf: WorkgroupSize,
     pub clip_reduce: WorkgroupSize,
@@ -159,7 +177,7 @@ impl WorkgroupCounts {
             path_scan1: (reduced_size / PATH_REDUCE_WG, 1, 1),
             path_scan: (path_tag_wgs, 1, 1),
             bbox_clear: (draw_object_wgs, 1, 1),
-            path_seg: (path_coarse_wgs, 1, 1),
+            flatten: (path_coarse_wgs, 1, 1),
             draw_reduce: (draw_object_wgs, 1, 1),
             draw_leaf: (draw_object_wgs, 1, 1),
             clip_reduce: (clip_reduce_wgs, 1, 1),
@@ -248,11 +266,14 @@ pub struct BufferSizes {
     pub clip_bboxes: BufferSize<ClipBbox>,
     pub draw_bboxes: BufferSize<DrawBbox>,
     pub bump_alloc: BufferSize<BumpAllocators>,
+    pub indirect_count: BufferSize<IndirectCount>,
     pub bin_headers: BufferSize<BinHeader>,
     pub paths: BufferSize<Path>,
     // Bump allocated buffers
+    pub lines: BufferSize<LineSoup>,
     pub bin_data: BufferSize<u32>,
     pub tiles: BufferSize<Tile>,
+    pub seg_counts: BufferSize<SegmentCount>,
     pub segments: BufferSize<PathSegment>,
     pub ptcl: BufferSize<u32>,
 }
@@ -284,6 +305,7 @@ impl BufferSizes {
         let clip_bboxes = BufferSize::new(n_clips);
         let draw_bboxes = BufferSize::new(n_paths);
         let bump_alloc = BufferSize::new(1);
+        let indirect_count = BufferSize::new(1);
         let bin_headers = BufferSize::new(draw_object_wgs * 256);
         let n_paths_aligned = align_up(n_paths, 256);
         let paths = BufferSize::new(n_paths_aligned);
@@ -293,6 +315,8 @@
         // reasonable heuristics.
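        // Note: these are element counts, not byte sizes; BufferSize is
        // generic over the element type. The fixed powers of two below are
        // assumptions sized for the test scenes so far; a more robust scheme
        // would check them against the BumpAllocators counters after a frame
        // and reallocate on overflow.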
         let bin_data = BufferSize::new(1 << 18);
         let tiles = BufferSize::new(1 << 21);
+        let lines = BufferSize::new(1 << 21);
+        let seg_counts = BufferSize::new(1 << 21);
         let segments = BufferSize::new(1 << 21);
         let ptcl = BufferSize::new(1 << 23);
         Self {
@@ -311,10 +335,13 @@ impl BufferSizes {
             clip_bboxes,
             draw_bboxes,
             bump_alloc,
+            indirect_count,
+            lines,
             bin_headers,
             paths,
             bin_data,
             tiles,
+            seg_counts,
             segments,
             ptcl,
         }
diff --git a/crates/encoding/src/lib.rs b/crates/encoding/src/lib.rs
index 62a7d6c8e..3d24c192f 100644
--- a/crates/encoding/src/lib.rs
+++ b/crates/encoding/src/lib.rs
@@ -24,8 +24,8 @@ mod resolve;
 pub use binning::BinHeader;
 pub use clip::{Clip, ClipBbox, ClipBic, ClipElement};
 pub use config::{
-    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, RenderConfig, WorkgroupCounts,
-    WorkgroupSize,
+    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, IndirectCount, RenderConfig,
+    WorkgroupCounts, WorkgroupSize,
 };
 pub use draw::{
     DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid,
@@ -35,7 +35,8 @@ pub use encoding::{Encoding, StreamOffsets};
 pub use math::Transform;
 pub use monoid::Monoid;
 pub use path::{
-    Cubic, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType, PathTag, Tile,
+    Cubic, LineSoup, Path, PathBbox, PathEncoder, PathMonoid, PathSegment, PathSegmentType,
+    PathTag, SegmentCount, Tile,
 };
 pub use resolve::{resolve_solid_paths_only, Layout};
diff --git a/crates/encoding/src/path.rs b/crates/encoding/src/path.rs
index 10e7499dd..414ce23e6 100644
--- a/crates/encoding/src/path.rs
+++ b/crates/encoding/src/path.rs
@@ -6,6 +6,28 @@ use peniko::kurbo::Shape;
 
 use super::Monoid;
 
+/// Line segment (after flattening, before tiling).
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct LineSoup {
+    pub path_ix: u32,
+    pub _padding: u32,
+    pub p0: [f32; 2],
+    pub p1: [f32; 2],
+}
+
+/// An intermediate data structure for sorting tile segments.
+#[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
+#[repr(C)]
+pub struct SegmentCount {
+    pub line_ix: u32,
+    // This could more accurately be modeled as:
+    //     segment_within_line: u16,
+    //     segment_within_slice: u16,
+    // However, here we mirror the way it's written in WGSL.
+    pub counts: u32,
+}
+
 /// Path segment.
 #[derive(Clone, Copy, Debug, Zeroable, Pod, Default)]
 #[repr(C)]
@@ -13,7 +35,7 @@ pub struct PathSegment {
     pub origin: [f32; 2],
     pub delta: [f32; 2],
     pub y_edge: f32,
-    pub next: u32,
+    pub _padding: u32,
 }
 
 /// Path segment type.
@@ -193,7 +215,7 @@ pub struct PathBbox {
 #[repr(C)]
 pub struct Path {
     /// Bounding box in tiles.
-    pub bbox: [f32; 4],
+    pub bbox: [u32; 4],
     /// Offset (in u32s) to tile rectangle.
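     /// Tiles are allocated as a contiguous rectangle covering the bounding
     /// box, row-major with stride `bbox.z - bbox.x`; the shaders index the
     /// rectangle as `tiles + (y - bbox.y) * stride + (x - bbox.x)`.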
pub tiles: u32, _padding: [u32; 3], diff --git a/shader/coarse.wgsl b/shader/coarse.wgsl index e497c85b4..2d324c798 100644 --- a/shader/coarse.wgsl +++ b/shader/coarse.wgsl @@ -33,7 +33,7 @@ var info_bin_data: array; var paths: array; @group(0) @binding(6) -var tiles: array; +var tiles: array; @group(0) @binding(7) var bump: BumpAllocators; @@ -82,31 +82,30 @@ fn alloc_cmd(size: u32) { } } -fn write_path(tile: Tile, linewidth: f32) -> bool { - // TODO: take flags - alloc_cmd(3u); - if linewidth < 0.0 { - let even_odd = linewidth < -1.0; - if tile.segments != 0u { - let fill = CmdFill(tile.segments, tile.backdrop); - ptcl[cmd_offset] = CMD_FILL; - let segments_and_rule = select(fill.tile << 1u, (fill.tile << 1u) | 1u, even_odd); - ptcl[cmd_offset + 1u] = segments_and_rule; - ptcl[cmd_offset + 2u] = u32(fill.backdrop); - cmd_offset += 3u; - } else { - if even_odd && (abs(tile.backdrop) & 1) == 0 { - return false; - } - ptcl[cmd_offset] = CMD_SOLID; - cmd_offset += 1u; - } +fn write_path(tile: Tile, tile_ix: u32, linewidth: f32) -> bool { + let even_odd = linewidth < -1.0; + // We overload the "segments" field to store both count (written by + // path_count stage) and segment allocation (used by path_tiling and + // fine). + let n_segs = tile.segments; + if n_segs != 0u { + var seg_ix = atomicAdd(&bump.segments, n_segs); + tiles[tile_ix].segments = ~seg_ix; + alloc_cmd(4u); + ptcl[cmd_offset] = CMD_FILL; + let size_and_rule = (n_segs << 1u) | u32(even_odd); + let fill = CmdFill(size_and_rule, seg_ix, tile.backdrop); + ptcl[cmd_offset + 1u] = fill.size_and_rule; + ptcl[cmd_offset + 2u] = fill.seg_data; + ptcl[cmd_offset + 3u] = u32(fill.backdrop); + cmd_offset += 4u; } else { - let stroke = CmdStroke(tile.segments, 0.5 * linewidth); - ptcl[cmd_offset] = CMD_STROKE; - ptcl[cmd_offset + 1u] = stroke.tile; - ptcl[cmd_offset + 2u] = bitcast(stroke.half_width); - cmd_offset += 3u; + if even_odd && (abs(tile.backdrop) & 1) == 0 { + return false; + } + alloc_cmd(1u); + ptcl[cmd_offset] = CMD_SOLID; + cmd_offset += 1u; } return true; } @@ -352,7 +351,7 @@ fn main( // DRAWTAG_FILL_COLOR case 0x44u: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { let rgba_color = scene[dd]; write_color(CmdColor(rgba_color)); } @@ -360,7 +359,7 @@ fn main( // DRAWTAG_FILL_LIN_GRADIENT case 0x114u: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { let index = scene[dd]; let info_offset = di + 1u; write_grad(CMD_LIN_GRAD, index, info_offset); @@ -369,7 +368,7 @@ fn main( // DRAWTAG_FILL_RAD_GRADIENT case 0x29cu: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { let index = scene[dd]; let info_offset = di + 1u; write_grad(CMD_RAD_GRAD, index, info_offset); @@ -378,7 +377,7 @@ fn main( // DRAWTAG_FILL_IMAGE case 0x248u: { let linewidth = bitcast(info_bin_data[di]); - if write_path(tile, linewidth) { + if write_path(tile, tile_ix, linewidth) { write_image(di + 1u); } } @@ -396,7 +395,7 @@ fn main( // DRAWTAG_END_CLIP case 0x21u: { clip_depth -= 1u; - write_path(tile, -1.0); + write_path(tile, tile_ix, -1.0); let blend = scene[dd]; let alpha = bitcast(scene[dd + 1u]); write_end_clip(CmdEndClip(blend, alpha)); diff --git a/shader/fine.wgsl b/shader/fine.wgsl index f64f0159b..79984596f 100644 --- a/shader/fine.wgsl +++ b/shader/fine.wgsl @@ -41,15 +41,10 @@ var gradients: texture_2d; var image_atlas: 
texture_2d; fn read_fill(cmd_ix: u32) -> CmdFill { - let tile = ptcl[cmd_ix + 1u]; - let backdrop = i32(ptcl[cmd_ix + 2u]); - return CmdFill(tile, backdrop); -} - -fn read_stroke(cmd_ix: u32) -> CmdStroke { - let tile = ptcl[cmd_ix + 1u]; - let half_width = bitcast(ptcl[cmd_ix + 2u]); - return CmdStroke(tile, half_width); + let size_and_rule = ptcl[cmd_ix + 1u]; + let seg_data = ptcl[cmd_ix + 2u]; + let backdrop = i32(ptcl[cmd_ix + 3u]); + return CmdFill(size_and_rule, seg_data, backdrop); } fn read_color(cmd_ix: u32) -> CmdColor { @@ -140,15 +135,15 @@ var output: texture_storage_2d; let PIXELS_PER_THREAD = 4u; -fn fill_path(tile: Tile, xy: vec2, even_odd: bool) -> array { +fn fill_path(seg_data: u32, n_segs: u32, backdrop: i32, xy: vec2, even_odd: bool) -> array { var area: array; - let backdrop_f = f32(tile.backdrop); + let backdrop_f = f32(backdrop); for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { area[i] = backdrop_f; } - var segment_ix = tile.segments; - while segment_ix != 0u { - let segment = segments[segment_ix]; + for (var i = 0u; i < n_segs; i++) { + let seg_off = seg_data + i; + let segment = segments[seg_off]; let y = segment.origin.y - xy.y; let y0 = clamp(y, 0.0, 1.0); let y1 = clamp(y + segment.delta.y, 0.0, 1.0); @@ -177,7 +172,6 @@ fn fill_path(tile: Tile, xy: vec2, even_odd: bool) -> array, even_odd: bool) -> array) -> array { - var df: array; - for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { - df[i] = 1e9; - } - var segment_ix = seg; - while segment_ix != 0u { - let segment = segments[segment_ix]; - let delta = segment.delta; - let dpos0 = xy + vec2(0.5, 0.5) - segment.origin; - let scale = 1.0 / dot(delta, delta); - for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { - let dpos = vec2(dpos0.x + f32(i), dpos0.y); - let t = clamp(dot(dpos, delta) * scale, 0.0, 1.0); - // performance idea: hoist sqrt out of loop - df[i] = min(df[i], length(delta * t - dpos)); - } - segment_ix = segment.next; - } - for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { - // reuse array; return alpha rather than distance - df[i] = clamp(half_width + 0.5 - df[i], 0.0, 1.0); - } - return df; -} - // The X size should be 16 / PIXELS_PER_THREAD @compute @workgroup_size(4, 16) fn main( @@ -250,16 +218,19 @@ fn main( // CMD_FILL case 1u: { let fill = read_fill(cmd_ix); - let segments = fill.tile >> 1u; - let even_odd = (fill.tile & 1u) != 0u; - let tile = Tile(fill.backdrop, segments); - area = fill_path(tile, xy, even_odd); - cmd_ix += 3u; + let n_segs = fill.size_and_rule >> 1u; + let even_odd = (fill.size_and_rule & 1u) != 0u; + area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd); + cmd_ix += 4u; } // CMD_STROKE case 2u: { - let stroke = read_stroke(cmd_ix); - area = stroke_path(stroke.tile, stroke.half_width, xy); + // Stroking in fine rasterization is disabled, as strokes will be expanded + // to fills earlier in the pipeline. This implementation is a stub, just to + // keep the shader from crashing. + for (var i = 0u; i < PIXELS_PER_THREAD; i++) { + area[i] = 0.0; + } cmd_ix += 3u; } // CMD_SOLID diff --git a/shader/pathseg.wgsl b/shader/flatten.wgsl similarity index 52% rename from shader/pathseg.wgsl rename to shader/flatten.wgsl index bf02419bc..b4197bce6 100644 --- a/shader/pathseg.wgsl +++ b/shader/flatten.wgsl @@ -1,20 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense -// Path segment decoding for the full case. - -// In the simple case, path segments are decoded as part of the coarse -// path rendering stage. 
In the full case, they are separated, as the -// decoding process also generates bounding boxes, and those in turn are -// used for tile allocation and clipping; actual coarse path rasterization -// can't proceed until those are complete. - -// There's some duplication of the decoding code but we won't worry about -// that just now. Perhaps it could be factored more nicely later. +// Flatten curves to lines #import config #import pathtag +#import segment #import cubic -#import transform +#import bump @group(0) @binding(0) var config: Config; @@ -38,37 +30,133 @@ struct AtomicPathBbox { var path_bboxes: array; @group(0) @binding(4) -var cubics: array; - -// Monoid is yagni, for future optimization - -// struct BboxMonoid { -// bbox: vec4, -// flags: u32, -// } - -// let FLAG_RESET_BBOX = 1u; -// let FLAG_SET_BBOX = 2u; - -// fn combine_bbox_monoid(a: BboxMonoid, b: BboxMonoid) -> BboxMonoid { -// var c: BboxMonoid; -// c.bbox = b.bbox; -// // TODO: previous-me thought this should be gated on b & SET_BBOX == false also -// if (a.flags & FLAG_RESET_BBOX) == 0u && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y { -// c.bbox = a.bbox; -// } else if (a.flags & FLAG_RESET_BBOX) == 0u && (b.flags & FLAG_SET_BBOX) == 0u || -// (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y) -// { -// c.bbox = vec4(min(a.bbox.xy, c.bbox.xy), max(a.bbox.xw, c.bbox.zw)); -// } -// c.flags = (a.flags & FLAG_SET_BBOX) | b.flags; -// c.flags |= (a.flags & FLAG_RESET_BBOX) << 1u; -// return c; -// } - -// fn bbox_monoid_identity() -> BboxMonoid { -// return BboxMonoid(); -// } +var bump: BumpAllocators; + +@group(0) @binding(5) +var lines: array; + +struct SubdivResult { + val: f32, + a0: f32, + a2: f32, +} + +let D = 0.67; +fn approx_parabola_integral(x: f32) -> f32 { + return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); +} + +let B = 0.39; +fn approx_parabola_inv_integral(x: f32) -> f32 { + return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); +} + +fn estimate_subdiv(p0: vec2, p1: vec2, p2: vec2, sqrt_tol: f32) -> SubdivResult { + let d01 = p1 - p0; + let d12 = p2 - p1; + let dd = d01 - d12; + let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; + let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9); + let x0 = dot(d01, dd) * cross_inv; + let x2 = dot(d12, dd) * cross_inv; + let scale = abs(cross / (length(dd) * (x2 - x0))); + + let a0 = approx_parabola_integral(x0); + let a2 = approx_parabola_integral(x2); + var val = 0.0; + if scale < 1e9 { + let da = abs(a2 - a0); + let sqrt_scale = sqrt(scale); + if sign(x0) == sign(x2) { + val = sqrt_scale; + } else { + let xmin = sqrt_tol / sqrt_scale; + val = sqrt_tol / approx_parabola_integral(xmin); + } + val *= da; + } + return SubdivResult(val, a0, a2); +} + +fn eval_quad(p0: vec2, p1: vec2, p2: vec2, t: f32) -> vec2 { + let mt = 1.0 - t; + return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; +} + +fn eval_cubic(p0: vec2, p1: vec2, p2: vec2, p3: vec2, t: f32) -> vec2 { + let mt = 1.0 - t; + return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; +} + +let MAX_QUADS = 16u; + +fn flatten_cubic(cubic: Cubic) { + let p0 = cubic.p0; + let p1 = cubic.p1; + let p2 = cubic.p2; + let p3 = cubic.p3; + let err_v = 3.0 * (p2 - p1) + p0 - p3; + let err = dot(err_v, err_v); + let ACCURACY = 0.25; + let Q_ACCURACY = ACCURACY * 0.1; + let REM_ACCURACY = ACCURACY - Q_ACCURACY; + let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; + var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u); + n_quads = min(n_quads, 
MAX_QUADS); + var keep_params: array; + var val = 0.0; + var qp0 = p0; + let step = 1.0 / f32(n_quads); + for (var i = 0u; i < n_quads; i += 1u) { + let t = f32(i + 1u) * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); + let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u); + var lp0 = p0; + qp0 = p0; + let v_step = val / f32(n); + var n_out = 1u; + var val_sum = 0.0; + for (var i = 0u; i < n_quads; i += 1u) { + let t = f32(i + 1u) * step; + let qp2 = eval_cubic(p0, p1, p2, p3, t); + var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); + qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); + let params = keep_params[i]; + let u0 = approx_parabola_inv_integral(params.a0); + let u2 = approx_parabola_inv_integral(params.a2); + let uscale = 1.0 / (u2 - u0); + var val_target = f32(n_out) * v_step; + while n_out == n || val_target < val_sum + params.val { + var lp1: vec2; + if n_out == n { + lp1 = p3; + } else { + let u = (val_target - val_sum) / params.val; + let a = mix(params.a0, params.a2, u); + let au = approx_parabola_inv_integral(a); + let t = (au - u0) * uscale; + lp1 = eval_quad(qp0, qp1, qp2, t); + } + + // Output line segment lp0..lp1 + let line_ix = atomicAdd(&bump.lines, 1u); + // TODO: check failure + lines[line_ix] = LineSoup(cubic.path_ix, lp0, lp1); + n_out += 1u; + val_target += v_step; + lp0 = lp1; + } + val_sum += params.val; + qp0 = qp2; + } +} var pathdata_base: u32; @@ -85,6 +173,11 @@ fn read_i16_point(ix: u32) -> vec2 { return vec2(x, y); } +struct Transform { + matrx: vec4, + translate: vec2, +} + fn read_transform(transform_base: u32, ix: u32) -> Transform { let base = transform_base + ix * 6u; let c0 = bitcast(scene[base]); @@ -98,6 +191,10 @@ fn read_transform(transform_base: u32, ix: u32) -> Transform { return Transform(matrx, translate); } +fn transform_apply(transform: Transform, p: vec2) -> vec2 { + return transform.matrx.xy * p.x + transform.matrx.zw * p.y + transform.translate; +} + fn round_down(x: f32) -> i32 { return i32(floor(x)); } @@ -116,6 +213,7 @@ fn main( pathdata_base = config.pathdata_base; let shift = (ix & 3u) * 8u; var tm = reduce_tag(tag_word & ((1u << shift) - 1u)); + // TODO: this can be a read buf overflow. Conditionalize by tag byte? tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm); var tag_byte = (tag_word >> shift) & 0xffu; @@ -181,7 +279,7 @@ fn main( bbox += vec4(-stroke, stroke); } let flags = u32(linewidth >= 0.0); - cubics[global_id.x] = Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags); + flatten_cubic(Cubic(p0, p1, p2, p3, stroke, tm.path_ix, flags)); // Update bounding box using atomics only. Computing a monoid is a // potential future optimization. 
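    // The per-path bbox entries are widened with atomicMin/atomicMax on the
    // AtomicPathBbox fields; the guard below skips segments whose local bbox
    // is empty so they contribute nothing to the running bounds.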
if bbox.z > bbox.x || bbox.w > bbox.y { diff --git a/shader/path_coarse.wgsl b/shader/path_coarse.wgsl deleted file mode 100644 index 5f168a63c..000000000 --- a/shader/path_coarse.wgsl +++ /dev/null @@ -1,309 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -#import config -#import pathtag - -@group(0) @binding(0) -var config: Config; - -@group(0) @binding(1) -var scene: array; - -@group(0) @binding(2) -var tag_monoids: array; - -#ifdef cubics_out -@group(0) @binding(3) -var output: array>; -#else -// We don't get this from import as it's the atomic version -struct AtomicTile { - backdrop: atomic, - segments: atomic, -} - -#import segment - -@group(0) @binding(3) -var tiles: array; - -@group(0) @binding(4) -var segments: array; -#endif - -var pathdata_base: u32; - -fn read_f32_point(ix: u32) -> vec2 { - let x = bitcast(scene[pathdata_base + ix]); - let y = bitcast(scene[pathdata_base + ix + 1u]); - return vec2(x, y); -} - -fn read_i16_point(ix: u32) -> vec2 { - let raw = scene[pathdata_base + ix]; - let x = f32(i32(raw << 16u) >> 16u); - let y = f32(i32(raw) >> 16u); - return vec2(x, y); -} - -#ifndef cubics_out -struct SubdivResult { - val: f32, - a0: f32, - a2: f32, -} - -let D = 0.67; -fn approx_parabola_integral(x: f32) -> f32 { - return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); -} - -let B = 0.39; -fn approx_parabola_inv_integral(x: f32) -> f32 { - return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); -} - -fn estimate_subdiv(p0: vec2, p1: vec2, p2: vec2, sqrt_tol: f32) -> SubdivResult { - let d01 = p1 - p0; - let d12 = p2 - p1; - let dd = d01 - d12; - let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; - let cross_inv = 1.0 / cross; - let x0 = dot(d01, dd) * cross_inv; - let x2 = dot(d12, dd) * cross_inv; - let scale = abs(cross / (length(dd) * (x2 - x0))); - - let a0 = approx_parabola_integral(x0); - let a2 = approx_parabola_integral(x2); - var val = 0.0; - if scale < 1e9 { - let da = abs(a2 - a0); - let sqrt_scale = sqrt(scale); - if sign(x0) == sign(x2) { - val = sqrt_scale; - } else { - let xmin = sqrt_tol / sqrt_scale; - val = sqrt_tol / approx_parabola_integral(xmin); - } - val *= da; - } - return SubdivResult(val, a0, a2); -} - -fn eval_quad(p0: vec2, p1: vec2, p2: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; -} - -fn eval_cubic(p0: vec2, p1: vec2, p2: vec2, p3: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; -} - -fn alloc_segment() -> u32 { - // Use 0-index segment (address is sentinel) as counter - // TODO: separate small buffer binding for this? - return atomicAdd(&tiles[4096].segments, 1u) + 1u; -} -#endif - -let MAX_QUADS = 16u; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, -) { - // Obtain exclusive prefix sum of tag monoid - let ix = global_id.x; - let tag_word = scene[config.pathtag_base + (ix >> 2u)]; - pathdata_base = config.pathdata_base; - let shift = (ix & 3u) * 8u; - var tm = reduce_tag(tag_word & ((1u << shift) - 1u)); - tm = combine_tag_monoid(tag_monoids[ix >> 2u], tm); - var tag_byte = (tag_word >> shift) & 0xffu; - // should be extractBits(tag_word, shift, 8)? 
- - // Decode path data - let seg_type = tag_byte & PATH_TAG_SEG_TYPE; - if seg_type != 0u { - var p0: vec2; - var p1: vec2; - var p2: vec2; - var p3: vec2; - if (tag_byte & PATH_TAG_F32) != 0u { - p0 = read_f32_point(tm.pathseg_offset); - p1 = read_f32_point(tm.pathseg_offset + 2u); - if seg_type >= PATH_TAG_QUADTO { - p2 = read_f32_point(tm.pathseg_offset + 4u); - if seg_type == PATH_TAG_CUBICTO { - p3 = read_f32_point(tm.pathseg_offset + 6u); - } - } - } else { - p0 = read_i16_point(tm.pathseg_offset); - p1 = read_i16_point(tm.pathseg_offset + 1u); - if seg_type >= PATH_TAG_QUADTO { - p2 = read_i16_point(tm.pathseg_offset + 2u); - if seg_type == PATH_TAG_CUBICTO { - p3 = read_i16_point(tm.pathseg_offset + 3u); - } - } - } - // TODO: transform goes here - // Degree-raise - if seg_type == PATH_TAG_LINETO { - p3 = p1; - p2 = mix(p3, p0, 1.0 / 3.0); - p1 = mix(p0, p3, 1.0 / 3.0); - } else if seg_type == PATH_TAG_QUADTO { - p3 = p2; - p2 = mix(p1, p2, 1.0 / 3.0); - p1 = mix(p1, p0, 1.0 / 3.0); - } -#ifdef cubics_out - let out_ix = ix * 4u; - output[out_ix] = p0; - output[out_ix + 1u] = p1; - output[out_ix + 2u] = p2; - output[out_ix + 3u] = p3; -#else - let err_v = 3.0 * (p2 - p1) + p0 - p3; - let err = dot(err_v, err_v); - let ACCURACY = 0.25; - let Q_ACCURACY = ACCURACY * 0.1; - let REM_ACCURACY = (ACCURACY - Q_ACCURACY); - let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; - var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u); - n_quads = min(n_quads, MAX_QUADS); - var keep_params: array; - var val = 0.0; - var qp0 = p0; - let step = 1.0 / f32(n_quads); - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); - keep_params[i] = params; - val += params.val; - qp0 = qp2; - } - let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u); - var lp0 = p0; - qp0 = p0; - let v_step = val / f32(n); - var n_out = 1u; - var val_sum = 0.0; - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = keep_params[i]; - let u0 = approx_parabola_inv_integral(params.a0); - let u2 = approx_parabola_inv_integral(params.a2); - let uscale = 1.0 / (u2 - u0); - var val_target = f32(n_out) * v_step; - while n_out == n || val_target < val_sum + params.val { - var lp1: vec2; - if n_out == n { - lp1 = p3; - } else { - let u = (val_target - val_sum) / params.val; - let a = mix(params.a0, params.a2, u); - let au = approx_parabola_inv_integral(a); - let t = (au - u0) * uscale; - lp1 = eval_quad(qp0, qp1, qp2, t); - } - - // Output line segment lp0..lp1 - let xymin = min(lp0, lp1); - let xymax = max(lp0, lp1); - let dp = lp1 - lp0; - let recip_dx = 1.0 / dp.x; - let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9); - let c = 0.5 * abs(invslope); - let b = invslope; - let SX = 1.0 / f32(TILE_WIDTH); - let SY = 1.0 / f32(TILE_HEIGHT); - let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX; - var x0 = i32(floor(xymin.x * SX)); - var x1 = i32(floor(xymax.x * SX) + 1.0); - var y0 = i32(floor(xymin.y * SY)); - var y1 = i32(floor(xymax.y * SY) + 1.0); - x0 = clamp(x0, 0, i32(config.width_in_tiles)); - x1 = clamp(x1, 0, i32(config.width_in_tiles)); - y0 = clamp(y0, 0, i32(config.height_in_tiles)); - y1 = 
clamp(y1, 0, i32(config.height_in_tiles)); - var xc = a + b * f32(y0); - var xray = i32(floor(lp0.x * SX)); - var last_xray = i32(floor(lp1.x * SX)); - if dp.y < 0.0 { - let tmp = xray; - xray = last_xray; - last_xray = tmp; - } - for (var y = y0; y < y1; y += 1) { - let tile_y0 = f32(y) * f32(TILE_HEIGHT); - let xbackdrop = max(xray + 1, 0); - if xymin.y < tile_y0 && xbackdrop < i32(config.width_in_tiles) { - let backdrop = select(-1, 1, dp.y < 0.0); - let tile_ix = y * i32(config.width_in_tiles) + xbackdrop; - atomicAdd(&tiles[tile_ix].backdrop, backdrop); - } - var next_xray = last_xray; - if y + 1 < y1 { - let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT); - let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope; - next_xray = i32(floor(x_edge * SX)); - } - let min_xray = min(xray, next_xray); - let max_xray = max(xray, next_xray); - var xx0 = min(i32(floor(xc - c)), min_xray); - var xx1 = max(i32(ceil(xc + c)), max_xray + 1); - xx0 = clamp(xx0, x0, x1); - xx1 = clamp(xx1, x0, x1); - var tile_seg: Segment; - for (var x = xx0; x < xx1; x += 1) { - let tile_x0 = f32(x) * f32(TILE_WIDTH); - let tile_ix = y * i32(config.width_in_tiles) + x; - // allocate segment, insert linked list - let seg_ix = alloc_segment(); - let old = atomicExchange(&tiles[tile_ix].segments, seg_ix); - tile_seg.origin = lp0; - tile_seg.delta = dp; - var y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx); - if xymin.x < tile_x0 { - let p = vec2(tile_x0, y_edge); - if dp.x < 0.0 { - tile_seg.delta = p - lp0; - } else { - tile_seg.origin = p; - tile_seg.delta = lp1 - p; - } - if tile_seg.delta.x == 0.0 { - tile_seg.delta.x = sign(dp.x) * 1e-9; - } - } - if x <= min_xray || max_xray < x { - y_edge = 1e9; - } - tile_seg.y_edge = y_edge; - tile_seg.next = old; - segments[seg_ix] = tile_seg; - } - xc += b; - xray = next_xray; - } - n_out += 1u; - val_target += v_step; - lp0 = lp1; - } - val_sum += params.val; - qp0 = qp2; - } -#endif - } -} diff --git a/shader/path_coarse_full.wgsl b/shader/path_coarse_full.wgsl deleted file mode 100644 index cf03301b6..000000000 --- a/shader/path_coarse_full.wgsl +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -// Path coarse rasterization for the full implementation. 
- -#import config -#import pathtag -#import tile -#import segment -#import cubic -#import bump - -@group(0) @binding(0) -var config: Config; - -@group(0) @binding(1) -var scene: array; - -@group(0) @binding(2) -var cubics: array; - -@group(0) @binding(3) -var paths: array; - -// We don't get this from import as it's the atomic version -struct AtomicTile { - backdrop: atomic, - segments: atomic, -} - -@group(0) @binding(4) -var bump: BumpAllocators; - -@group(0) @binding(5) -var tiles: array; - -@group(0) @binding(6) -var segments: array; - -struct SubdivResult { - val: f32, - a0: f32, - a2: f32, -} - -let D = 0.67; -fn approx_parabola_integral(x: f32) -> f32 { - return x * inverseSqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); -} - -let B = 0.39; -fn approx_parabola_inv_integral(x: f32) -> f32 { - return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); -} - -fn estimate_subdiv(p0: vec2, p1: vec2, p2: vec2, sqrt_tol: f32) -> SubdivResult { - let d01 = p1 - p0; - let d12 = p2 - p1; - let dd = d01 - d12; - let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; - let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9); - let x0 = dot(d01, dd) * cross_inv; - let x2 = dot(d12, dd) * cross_inv; - let scale = abs(cross / (length(dd) * (x2 - x0))); - - let a0 = approx_parabola_integral(x0); - let a2 = approx_parabola_integral(x2); - var val = 0.0; - if scale < 1e9 { - let da = abs(a2 - a0); - let sqrt_scale = sqrt(scale); - if sign(x0) == sign(x2) { - val = sqrt_scale; - } else { - let xmin = sqrt_tol / sqrt_scale; - val = sqrt_tol / approx_parabola_integral(xmin); - } - val *= da; - } - return SubdivResult(val, a0, a2); -} - -fn eval_quad(p0: vec2, p1: vec2, p2: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; -} - -fn eval_cubic(p0: vec2, p1: vec2, p2: vec2, p3: vec2, t: f32) -> vec2 { - let mt = 1.0 - t; - return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; -} - -fn alloc_segment() -> u32 { - var offset = atomicAdd(&bump.segments, 1u) + 1u; - if offset + 1u > config.segments_size { - offset = 0u; - atomicOr(&bump.failed, STAGE_PATH_COARSE); - } - return offset; -} - -let MAX_QUADS = 16u; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, -) { - // Exit early if prior stages failed, as we can't run this stage. - // We need to check only prior stages, as if this stage has failed in another workgroup, - // we still want to know this workgroup's memory requirement. - if (atomicLoad(&bump.failed) & (STAGE_BINNING | STAGE_TILE_ALLOC)) != 0u { - return; - } - let ix = global_id.x; - let tag_word = scene[config.pathtag_base + (ix >> 2u)]; - let shift = (ix & 3u) * 8u; - var tag_byte = (tag_word >> shift) & 0xffu; - - if (tag_byte & PATH_TAG_SEG_TYPE) != 0u { - // Discussion question: it might actually be cheaper to do the path segment - // decoding & transform again rather than store the result in a buffer; - // classic memory vs ALU tradeoff. 
- let cubic = cubics[global_id.x]; - let path = paths[cubic.path_ix]; - let is_stroke = (cubic.flags & CUBIC_IS_STROKE) != 0u; - let bbox = vec4(path.bbox); - let p0 = cubic.p0; - let p1 = cubic.p1; - let p2 = cubic.p2; - let p3 = cubic.p3; - let err_v = 3.0 * (p2 - p1) + p0 - p3; - let err = dot(err_v, err_v); - let ACCURACY = 0.25; - let Q_ACCURACY = ACCURACY * 0.1; - let REM_ACCURACY = (ACCURACY - Q_ACCURACY); - let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; - var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1u); - n_quads = min(n_quads, MAX_QUADS); - var keep_params: array; - var val = 0.0; - var qp0 = p0; - let step = 1.0 / f32(n_quads); - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); - keep_params[i] = params; - val += params.val; - qp0 = qp2; - } - let n = max(u32(ceil(val * (0.5 / sqrt(REM_ACCURACY)))), 1u); - var lp0 = p0; - qp0 = p0; - let v_step = val / f32(n); - var n_out = 1u; - var val_sum = 0.0; - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = keep_params[i]; - let u0 = approx_parabola_inv_integral(params.a0); - let u2 = approx_parabola_inv_integral(params.a2); - let uscale = 1.0 / (u2 - u0); - var val_target = f32(n_out) * v_step; - while n_out == n || val_target < val_sum + params.val { - var lp1: vec2; - if n_out == n { - lp1 = p3; - } else { - let u = (val_target - val_sum) / params.val; - let a = mix(params.a0, params.a2, u); - let au = approx_parabola_inv_integral(a); - let t = (au - u0) * uscale; - lp1 = eval_quad(qp0, qp1, qp2, t); - } - - // Output line segment lp0..lp1 - let xymin = min(lp0, lp1) - cubic.stroke; - let xymax = max(lp0, lp1) + cubic.stroke; - let dp = lp1 - lp0; - let recip_dx = 1.0 / dp.x; - let invslope = select(dp.x / dp.y, 1.0e9, abs(dp.y) < 1.0e-9); - let SX = 1.0 / f32(TILE_WIDTH); - let SY = 1.0 / f32(TILE_HEIGHT); - let c = (cubic.stroke.x + abs(invslope) * (0.5 * f32(TILE_HEIGHT) + cubic.stroke.y)) * SX; - let b = invslope; - let a = (lp0.x - (lp0.y - 0.5 * f32(TILE_HEIGHT)) * b) * SX; - var x0 = i32(floor(xymin.x * SX)); - var x1 = i32(floor(xymax.x * SX) + 1.0); - var y0 = i32(floor(xymin.y * SY)); - var y1 = i32(floor(xymax.y * SY) + 1.0); - x0 = clamp(x0, bbox.x, bbox.z); - x1 = clamp(x1, bbox.x, bbox.z); - y0 = clamp(y0, bbox.y, bbox.w); - y1 = clamp(y1, bbox.y, bbox.w); - var xc = a + b * f32(y0); - let stride = bbox.z - bbox.x; - var base = i32(path.tiles) + (y0 - bbox.y) * stride - bbox.x; - var xray = i32(floor(lp0.x * SX)); - var last_xray = i32(floor(lp1.x * SX)); - if dp.y < 0.0 { - let tmp = xray; - xray = last_xray; - last_xray = tmp; - } - for (var y = y0; y < y1; y += 1) { - let tile_y0 = f32(y) * f32(TILE_HEIGHT); - let xbackdrop = max(xray + 1, bbox.x); - if !is_stroke && xymin.y < tile_y0 && xbackdrop < bbox.z { - let backdrop = select(-1, 1, dp.y < 0.0); - let tile_ix = base + xbackdrop; - atomicAdd(&tiles[tile_ix].backdrop, backdrop); - } - var next_xray = last_xray; - if y + 1 < y1 { - let tile_y1 = f32(y + 1) * f32(TILE_HEIGHT); - let x_edge = lp0.x + (tile_y1 - lp0.y) * invslope; - next_xray = i32(floor(x_edge * SX)); - } - let min_xray = min(xray, next_xray); - let max_xray = max(xray, 
next_xray); - var xx0 = min(i32(floor(xc - c)), min_xray); - var xx1 = max(i32(ceil(xc + c)), max_xray + 1); - xx0 = clamp(xx0, x0, x1); - xx1 = clamp(xx1, x0, x1); - var tile_seg: Segment; - for (var x = xx0; x < xx1; x += 1) { - let tile_x0 = f32(x) * f32(TILE_WIDTH); - let tile_ix = base + x; - // allocate segment, insert linked list - let seg_ix = alloc_segment(); - let old = atomicExchange(&tiles[tile_ix].segments, seg_ix); - tile_seg.origin = lp0; - tile_seg.delta = dp; - var y_edge = 0.0; - if !is_stroke { - y_edge = mix(lp0.y, lp1.y, (tile_x0 - lp0.x) * recip_dx); - if xymin.x < tile_x0 { - let p = vec2(tile_x0, y_edge); - if dp.x < 0.0 { - tile_seg.delta = p - lp0; - } else { - tile_seg.origin = p; - tile_seg.delta = lp1 - p; - } - if tile_seg.delta.x == 0.0 { - tile_seg.delta.x = sign(dp.x) * 1e-9; - } - } - if x <= min_xray || max_xray < x { - y_edge = 1e9; - } - } - tile_seg.y_edge = y_edge; - tile_seg.next = old; - segments[seg_ix] = tile_seg; - } - xc += b; - base += stride; - xray = next_xray; - } - n_out += 1u; - val_target += v_step; - lp0 = lp1; - } - val_sum += params.val; - qp0 = qp2; - } - } -} diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl new file mode 100644 index 000000000..3cf4e4c02 --- /dev/null +++ b/shader/path_count.wgsl @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Stage to compute counts of number of segments in each tile + +#import bump +#import config +#import segment +#import tile + +@group(0) @binding(0) +var config: Config; + +// TODO: this is cut'n'pasted from path_coarse. +struct AtomicTile { + backdrop: atomic, + // This is "segments" but we're renaming + count: atomic, +} + +@group(0) @binding(1) +var bump: BumpAllocators; + +@group(0) @binding(2) +var lines: array; + +@group(0) @binding(3) +var paths: array; + +@group(0) @binding(4) +var tile: array; + +@group(0) @binding(5) +var seg_counts: array; + +// number of integer cells spanned by interval defined by a, b +fn span(a: f32, b: f32) -> u32 { + return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0)); +} + +// Note regarding clipping to bounding box: +// +// We have to do the backdrop bumps for all tiles to the left of the bbox. +// This should probably be a separate loop. This also causes a numerical +// robustness issue. + +// This shader is dispatched with one thread for each line. +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, +) { + let n_lines = atomicLoad(&bump.lines); + var count = 0u; + if global_id.x < n_lines { + let line = lines[global_id.x]; + // coarse rasterization logic to count number of tiles touched by line + let is_down = line.p1.y >= line.p0.y; + let xy0 = select(line.p1, line.p0, is_down); + let xy1 = select(line.p0, line.p1, is_down); + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + // TODO: clip line to bounding box + count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u; + let line_ix = global_id.x; + + let dx = abs(s1.x - s0.x); + let dy = s1.y - s0.y; + if dx + dy == 0.0 { + // Zero-length segment, drop it. Note, this should be culled earlier. 
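+            // (A zero-length segment would also make idxdy below infinite
+            // and poison a and b.)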
+ return; + } + if dy == 0.0 && floor(s0.y) == s0.y { + return; + } + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = select(-1.0, 1.0, is_positive_slope); + let xt0 = floor(s0.x * sign); + let c = s0.x * sign - xt0; + let y0 = floor(s0.y); + let ytop = select(y0 + 1.0, ceil(s0.y), s0.y == s1.y); + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0 = xt0 * sign + select(-1.0, 0.0, is_positive_slope); + + let path = paths[line.path_ix]; + let bbox = vec4(path.bbox); + let xmin = min(s0.x, s1.x); + // If line is to left of bbox, we may still need to do backdrop + let stride = bbox.z - bbox.x; + if s0.y >= f32(bbox.w) || s1.y <= f32(bbox.y) || xmin >= f32(bbox.z) || stride == 0 { + return; + } + // Clip line to bounding box. Clipping is done in "i" space. + var imin = 0u; + if s0.y < f32(bbox.y) { + var iminf = round((f32(bbox.y) - y0 + b - a) / (1.0 - a)) - 1.0; + // Numerical robustness: goal is to find the first i value for which + // the following predicate is false. Above formula is designed to + // undershoot by 0.5. + if y0 + iminf - floor(a * iminf + b) < f32(bbox.y) { + iminf += 1.0; + } + imin = u32(iminf); + } + var imax = count; + if s1.y > f32(bbox.w) { + var imaxf = round((f32(bbox.w) - y0 + b - a) / (1.0 - a)) - 1.0; + if y0 + imaxf - floor(a * imaxf + b) < f32(bbox.w) { + imaxf += 1.0; + } + imax = u32(imaxf); + } + let delta = select(1, -1, is_down); + var ymin = 0; + var ymax = 0; + if max(s0.x, s1.x) <= f32(bbox.x) { + ymin = i32(ceil(s0.y)); + ymax = i32(ceil(s1.y)); + imax = imin; + } else { + let fudge = select(1.0, 0.0, is_positive_slope); + if xmin < f32(bbox.x) { + var f = round((sign * (f32(bbox.x) - x0) - b + fudge) / a); + if (x0 + sign * floor(a * f + b) < f32(bbox.x)) == is_positive_slope { + f += 1.0; + } + let ynext = i32(y0 + f - floor(a * f + b) + 1.0); + if is_positive_slope { + if u32(f) > imin { + ymin = i32(y0 + select(1.0, 0.0, y0 == s0.y)); + ymax = ynext; + imin = u32(f); + } + } else { + if u32(f) < imax { + ymin = ynext; + ymax = i32(ceil(s1.y)); + imax = u32(f); + } + } + } + if max(s0.x, s1.x) > f32(bbox.z) { + var f = round((sign * (f32(bbox.z) - x0) - b + fudge) / a); + if (x0 + sign * floor(a * f + b) < f32(bbox.z)) == is_positive_slope { + f += 1.0; + } + if is_positive_slope { + imax = min(imax, u32(f)); + } else { + imin = max(imin, u32(f)); + } + } + } + imax = max(imin, imax); + // Apply backdrop for part of line left of bbox + ymin = max(ymin, bbox.y); + ymax = min(ymax, bbox.w); + for (var y = ymin; y < ymax; y++) { + let base = i32(path.tiles) + (y - bbox.y) * stride; + atomicAdd(&tile[base].backdrop, delta); + } + var last_z = floor(a * (f32(imin) - 1.0) + b); + let seg_base = atomicAdd(&bump.seg_counts, imax - imin); + for (var i = imin; i < imax; i++) { + let subix = i; + // coarse rasterization logic + // Note: we hope fast-math doesn't strength reduce this. 
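+            // path_tiling recomputes floor(a * i + b) from the stored
+            // SegmentCount, so this expression must evaluate bit-identically
+            // in both shaders; an incrementally updated form could round
+            // differently and break the reconstruction.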
+ let zf = a * f32(subix) + b; + let z = floor(zf); + // x, y are tile coordinates relative to render target + let y = i32(y0 + f32(subix) - z); + let x = i32(x0 + sign * z); + let base = i32(path.tiles) + (y - bbox.y) * stride - bbox.x; + let top_edge = select(last_z == z, y0 == s0.y, subix == 0u); + if top_edge && x + 1 < bbox.z { + let x_bump = max(x + 1, bbox.x); + atomicAdd(&tile[base + x_bump].backdrop, delta); + } + let seg_within_slice = atomicAdd(&tile[base + x].count, 1u); + // Pack two count values into a single u32 + let counts = (seg_within_slice << 16u) | subix; + let seg_count = SegmentCount(line_ix, counts); + seg_counts[seg_base + i - imin] = seg_count; + // Note: since we're iterating, we have a reliable value for + // last_z. + last_z = z; + } + } +} diff --git a/shader/path_count_setup.wgsl b/shader/path_count_setup.wgsl new file mode 100644 index 000000000..95af29c38 --- /dev/null +++ b/shader/path_count_setup.wgsl @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Set up dispatch size for path count stage. + +#import bump + +@group(0) @binding(0) +var bump: BumpAllocators; + +@group(0) @binding(1) +var indirect: IndirectCount; + +// Partition size for path count stage +let WG_SIZE = 256u; + +@compute @workgroup_size(1) +fn main() { + let lines = atomicLoad(&bump.lines); + indirect.count_x = (lines + (WG_SIZE - 1u)) / WG_SIZE; + indirect.count_y = 1u; + indirect.count_z = 1u; +} diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl new file mode 100644 index 000000000..444974e6a --- /dev/null +++ b/shader/path_tiling.wgsl @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Write path segments + +#import bump +#import config +#import segment +#import tile + +@group(0) @binding(0) +var bump: BumpAllocators; + +@group(0) @binding(1) +var seg_counts: array; + +@group(0) @binding(2) +var lines: array; + +@group(0) @binding(3) +var paths: array; + +@group(0) @binding(4) +var tiles: array; + +@group(0) @binding(5) +var segments: array; + +fn span(a: f32, b: f32) -> u32 { + return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0)); +} + +// One invocation for each tile that is to be written. 
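+// Each invocation reads one SegmentCount entry and writes one Segment.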
+// Total number of invocations = bump.seg_counts +@compute @workgroup_size(256) +fn main( + @builtin(global_invocation_id) global_id: vec3, +) { + let n_segments = atomicLoad(&bump.seg_counts); + if global_id.x < n_segments { + let seg_count = seg_counts[global_id.x]; + let line = lines[seg_count.line_ix]; + let counts = seg_count.counts; + let seg_within_slice = counts >> 16u; + let seg_within_line = counts & 0xffffu; + + // coarse rasterization logic + let is_down = line.p1.y >= line.p0.y; + var xy0 = select(line.p1, line.p0, is_down); + var xy1 = select(line.p0, line.p1, is_down); + let s0 = xy0 * TILE_SCALE; + let s1 = xy1 * TILE_SCALE; + let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u; + let dx = abs(s1.x - s0.x); + let dy = s1.y - s0.y; + let idxdy = 1.0 / (dx + dy); + let a = dx * idxdy; + let is_positive_slope = s1.x >= s0.x; + let sign = select(-1.0, 1.0, is_positive_slope); + let xt0 = floor(s0.x * sign); + let c = s0.x * sign - xt0; + let y0i = floor(s0.y); + let ytop = select(y0i + 1.0, ceil(s0.y), s0.y == s1.y); + let b = (dy * c + dx * (ytop - s0.y)) * idxdy; + let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0)); + let z = floor(a * f32(seg_within_line) + b); + let x = x0i + i32(sign * z); + let y = i32(y0i + f32(seg_within_line) - z); + + let path = paths[line.path_ix]; + let bbox = vec4(path.bbox); + let stride = bbox.z - bbox.x; + let tile_ix = i32(path.tiles) + (y - bbox.y) * stride + x - bbox.x; + let tile = tiles[tile_ix]; + let seg_start = ~tile.segments; + if i32(seg_start) < 0 { + return; + } + let tile_xy = vec2(f32(x) * f32(TILE_WIDTH), f32(y) * f32(TILE_HEIGHT)); + let tile_xy1 = tile_xy + vec2(f32(TILE_WIDTH), f32(TILE_HEIGHT)); + + if seg_within_line > 0u { + let z_prev = floor(a * (f32(seg_within_line) - 1.0) + b); + if z == z_prev { + // Top edge is clipped + var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy.y - xy0.y) / (xy1.y - xy0.y); + // TODO: we want to switch to tile-relative coordinates + xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x); + xy0 = vec2(xt, tile_xy.y); + } else { + // If is_positive_slope, left edge is clipped, otherwise right + let x_clip = select(tile_xy1.x, tile_xy.x, is_positive_slope); + var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y); + xy0 = vec2(x_clip, yt); + } + } + if seg_within_line < count - 1u { + let z_next = floor(a * (f32(seg_within_line) + 1.0) + b); + if z == z_next { + // Bottom edge is clipped + var xt = xy0.x + (xy1.x - xy0.x) * (tile_xy1.y - xy0.y) / (xy1.y - xy0.y); + xt = clamp(xt, tile_xy.x + 1e-3, tile_xy1.x); + xy1 = vec2(xt, tile_xy1.y); + } else { + // If is_positive_slope, right edge is clipped, otherwise left + let x_clip = select(tile_xy.x, tile_xy1.x, is_positive_slope); + var yt = xy0.y + (xy1.y - xy0.y) * (x_clip - xy0.x) / (xy1.x - xy0.x); + yt = clamp(yt, tile_xy.y + 1e-3, tile_xy1.y); + xy1 = vec2(x_clip, yt); + } + } + // See comments in CPU version of shader + var y_edge = 1e9; + if xy0.x == tile_xy.x { + y_edge = xy0.y; + } else if xy1.x == tile_xy.x { + y_edge = xy1.y; + } + if !is_down { + let tmp = xy0; + xy0 = xy1; + xy1 = tmp; + } + let segment = Segment(xy0, xy1 - xy0, y_edge); + segments[seg_start + seg_within_slice] = segment; + } +} diff --git a/shader/path_tiling_setup.wgsl b/shader/path_tiling_setup.wgsl new file mode 100644 index 000000000..722be1914 --- /dev/null +++ b/shader/path_tiling_setup.wgsl @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Set up dispatch size for path tiling stage. 
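+//
+// Rounds the seg_counts bump counter up to whole 256-thread workgroups,
+// i.e. count_x = ceil(seg_counts / WG_SIZE).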
+ +#import bump + +@group(0) @binding(0) +var bump: BumpAllocators; + +@group(0) @binding(1) +var indirect: IndirectCount; + +// Partition size for path tiling stage +let WG_SIZE = 256u; + +@compute @workgroup_size(1) +fn main() { + let segments = atomicLoad(&bump.seg_counts); + indirect.count_x = (segments + (WG_SIZE - 1u)) / WG_SIZE; + indirect.count_y = 1u; + indirect.count_z = 1u; +} diff --git a/shader/shared/bump.wgsl b/shader/shared/bump.wgsl index 4864f3139..6c5789f15 100644 --- a/shader/shared/bump.wgsl +++ b/shader/shared/bump.wgsl @@ -6,13 +6,21 @@ let STAGE_TILE_ALLOC: u32 = 0x2u; let STAGE_PATH_COARSE: u32 = 0x4u; let STAGE_COARSE: u32 = 0x8u; -// This must be kept in sync with the struct in src/render.rs +// This must be kept in sync with the struct in config.rs in the encoding crate. struct BumpAllocators { // Bitmask of stages that have failed allocation. failed: atomic, binning: atomic, ptcl: atomic, tile: atomic, + seg_counts: atomic, segments: atomic, blend: atomic, + lines: atomic, +} + +struct IndirectCount { + count_x: u32, + count_y: u32, + count_z: u32, } diff --git a/shader/shared/config.wgsl b/shader/shared/config.wgsl index 4ac718c3e..1e553f9ae 100644 --- a/shader/shared/config.wgsl +++ b/shader/shared/config.wgsl @@ -48,6 +48,9 @@ let N_TILE_Y = 16u; //let N_TILE = N_TILE_X * N_TILE_Y; let N_TILE = 256u; +// Not currently supporting non-square tiles +let TILE_SCALE = 0.0625; + let BLEND_STACK_SPLIT = 4u; // The following are computed in draw_leaf from the generic gradient parameters diff --git a/shader/shared/ptcl.wgsl b/shader/shared/ptcl.wgsl index d5fc5d4fd..cf759a5b4 100644 --- a/shader/shared/ptcl.wgsl +++ b/shader/shared/ptcl.wgsl @@ -25,7 +25,8 @@ let CMD_JUMP = 11u; // hand in the relevant shaders struct CmdFill { - tile: u32, + size_and_rule: u32, + seg_data: u32, backdrop: i32, } diff --git a/shader/shared/segment.wgsl b/shader/shared/segment.wgsl index 7d0359262..9fe134478 100644 --- a/shader/shared/segment.wgsl +++ b/shader/shared/segment.wgsl @@ -1,8 +1,33 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense +// Segments laid out for contiguous storage struct Segment { origin: vec2, delta: vec2, y_edge: f32, - next: u32, +} + +// A line segment produced by flattening and ready for rasterization. +// +// The name is perhaps too playful, but reflects the fact that these +// lines are completely unordered. They will flow through coarse path +// rasterization, then the per-tile segments will be scatter-written into +// segment storage so that each (tile, path) tuple gets a contiguous +// slice of segments. +struct LineSoup { + path_ix: u32, + // Note: this creates an alignment gap. Don't worry about + // this now, but maybe move to scalars. + p0: vec2, + p1: vec2, +} + +// An intermediate data structure for sorting tile segments. 
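+//
+// path_count emits one of these per (line, tile) crossing. Once coarse has
+// allocated a contiguous slice of segment storage for each tile, path_tiling
+// re-reads the entry and writes the clipped segment to slot
+// seg_start + seg_within_slice.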
+struct SegmentCount { + // Reference to element of LineSoup array + line_ix: u32, + // Two count values packed into a single u32 + // Lower 16 bits: index of segment within line + // Upper 16 bits: index of segment within segment slice + counts: u32, } diff --git a/src/lib.rs b/src/lib.rs index f45bdcf2b..55147acd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -251,7 +251,9 @@ impl Renderer { ) -> Result> { let mut render = Render::new(); let encoding = scene.data(); - let recording = render.render_encoding_coarse(encoding, &self.shaders, params, true); + // TODO: turn this on; the download feature interacts with CPU dispatch + let robust = false; + let recording = render.render_encoding_coarse(encoding, &self.shaders, params, robust); let target = render.out_image(); let bump_buf = render.bump_buf(); self.engine.run_recording( diff --git a/src/render.rs b/src/render.rs index 03e648446..462563633 100644 --- a/src/render.rs +++ b/src/render.rs @@ -190,17 +190,21 @@ impl Render { wg_counts.bbox_clear, [config_buf, path_bbox_buf], ); - let cubic_buf = - ResourceProxy::new_buf(buffer_sizes.cubics.size_in_bytes().into(), "cubic_buf"); + let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); + recording.clear_all(bump_buf); + let bump_buf = ResourceProxy::Buf(bump_buf); + let lines_buf = + ResourceProxy::new_buf(buffer_sizes.lines.size_in_bytes().into(), "lines_buf"); recording.dispatch( - shaders.pathseg, - wg_counts.path_seg, + shaders.flatten, + wg_counts.flatten, [ config_buf, scene_buf, tagmonoid_buf, path_bbox_buf, - cubic_buf, + bump_buf, + lines_buf, ], ); let draw_reduced_buf = ResourceProxy::new_buf( @@ -273,13 +277,10 @@ impl Render { buffer_sizes.draw_bboxes.size_in_bytes().into(), "draw_bbox_buf", ); - let bump_buf = BufProxy::new(buffer_sizes.bump_alloc.size_in_bytes().into(), "bump_buf"); let bin_header_buf = ResourceProxy::new_buf( buffer_sizes.bin_headers.size_in_bytes().into(), "bin_header_buf", ); - recording.clear_all(bump_buf); - let bump_buf = ResourceProxy::Buf(bump_buf); recording.dispatch( shaders.binning, wg_counts.binning, @@ -314,21 +315,33 @@ impl Render { ], ); recording.free_resource(draw_bbox_buf); + recording.free_resource(tagmonoid_buf); + let indirect_count_buf = BufProxy::new( + buffer_sizes.indirect_count.size_in_bytes().into(), + "indirect_count", + ); recording.dispatch( - shaders.path_coarse, - wg_counts.path_coarse, + shaders.path_count_setup, + (1, 1, 1), + [bump_buf, indirect_count_buf.into()], + ); + let seg_counts_buf = ResourceProxy::new_buf( + buffer_sizes.seg_counts.size_in_bytes().into(), + "seg_counts_buf", + ); + recording.dispatch_indirect( + shaders.path_count, + indirect_count_buf, + 0, [ config_buf, - scene_buf, - cubic_buf, - path_buf, bump_buf, + lines_buf, + path_buf, tile_buf, - segments_buf, + seg_counts_buf, ], ); - recording.free_resource(tagmonoid_buf); - recording.free_resource(cubic_buf); recording.dispatch( shaders.backdrop, wg_counts.backdrop, @@ -349,6 +362,27 @@ impl Render { ptcl_buf, ], ); + recording.dispatch( + shaders.path_tiling_setup, + (1, 1, 1), + [bump_buf, indirect_count_buf.into()], + ); + recording.dispatch_indirect( + shaders.path_tiling, + indirect_count_buf, + 0, + [ + bump_buf, + seg_counts_buf, + lines_buf, + path_buf, + tile_buf, + segments_buf, + ], + ); + recording.free_buf(indirect_count_buf); + recording.free_resource(seg_counts_buf); + recording.free_resource(lines_buf); recording.free_resource(scene_buf); recording.free_resource(draw_monoid_buf); 
recording.free_resource(bin_header_buf); diff --git a/src/shaders.rs b/src/shaders.rs index b55f158c1..23a3950f3 100644 --- a/src/shaders.rs +++ b/src/shaders.rs @@ -65,16 +65,19 @@ pub struct FullShaders { pub pathtag_scan: ShaderId, pub pathtag_scan_large: ShaderId, pub bbox_clear: ShaderId, - pub pathseg: ShaderId, + pub flatten: ShaderId, pub draw_reduce: ShaderId, pub draw_leaf: ShaderId, pub clip_reduce: ShaderId, pub clip_leaf: ShaderId, pub binning: ShaderId, pub tile_alloc: ShaderId, - pub path_coarse: ShaderId, pub backdrop: ShaderId, + pub path_count_setup: ShaderId, + pub path_count: ShaderId, pub coarse: ShaderId, + pub path_tiling_setup: ShaderId, + pub path_tiling: ShaderId, pub fine: ShaderId, } @@ -147,16 +150,17 @@ pub fn full_shaders( preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(), &[BindType::Uniform, BindType::Buffer], )?; - let pathseg = engine.add_shader( + let flatten = engine.add_shader( device, - "pathseg", - preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(), + "flatten", + preprocess::preprocess(shader!("flatten"), &full_config, &imports).into(), &[ BindType::Uniform, BindType::BufReadOnly, BindType::BufReadOnly, BindType::Buffer, BindType::Buffer, + BindType::Buffer, ], )?; let draw_reduce = engine.add_shader( @@ -232,17 +236,21 @@ pub fn full_shaders( BindType::Buffer, ], )?; - - let path_coarse = engine.add_shader( + let path_count_setup = engine.add_shader( + device, + "path_count_setup", + preprocess::preprocess(shader!("path_count_setup"), &empty, &imports).into(), + &[BindType::BufReadOnly, BindType::Buffer], + )?; + let path_count = engine.add_shader( device, - "path_coarse_full", - preprocess::preprocess(shader!("path_coarse_full"), &full_config, &imports).into(), + "path_count", + preprocess::preprocess(shader!("path_count"), &full_config, &imports).into(), &[ BindType::Uniform, + BindType::Buffer, BindType::BufReadOnly, BindType::BufReadOnly, - BindType::BufReadOnly, - BindType::Buffer, BindType::Buffer, BindType::Buffer, ], @@ -264,9 +272,28 @@ pub fn full_shaders( BindType::BufReadOnly, BindType::BufReadOnly, BindType::BufReadOnly, - BindType::BufReadOnly, BindType::Buffer, BindType::Buffer, + BindType::Buffer, + ], + )?; + let path_tiling_setup = engine.add_shader( + device, + "path_tiling_setup", + preprocess::preprocess(shader!("path_tiling_setup"), &empty, &imports).into(), + &[BindType::BufReadOnly, BindType::Buffer], + )?; + let path_tiling = engine.add_shader( + device, + "path_tiling", + preprocess::preprocess(shader!("path_tiling"), &empty, &imports).into(), + &[ + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, ], )?; let fine = engine.add_shader( @@ -290,16 +317,19 @@ pub fn full_shaders( pathtag_scan1, pathtag_scan_large, bbox_clear, - pathseg, + flatten, draw_reduce, draw_leaf, clip_reduce, clip_leaf, binning, tile_alloc, - path_coarse, + path_count_setup, + path_count, backdrop, coarse, + path_tiling_setup, + path_tiling, fine, }) }
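For intuition about the tile-counting rule shared by path_count.wgsl and path_tiling.wgsl, the following is a minimal CPU sketch in Rust. It assumes the 16-pixel square tiles implied by TILE_SCALE = 0.0625 in shared/config.wgsl; the function names are illustrative, not part of the crate's API.

    // Number of integer grid cells spanned by the interval between a and b
    // (order agnostic), never less than 1. Mirrors span() in path_count.wgsl.
    fn span(a: f32, b: f32) -> u32 {
        (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32
    }

    // Tile side implied by TILE_SCALE = 0.0625 in shared/config.wgsl.
    const TILE_SCALE: f32 = 0.0625;

    // Tiles touched by the line p0..p1 (pixel coordinates). A line enters a
    // new tile at each horizontal or vertical grid crossing, so the total is
    // (span_x - 1) + (span_y - 1) + 1 = span_x + span_y - 1.
    fn tile_count(p0: [f32; 2], p1: [f32; 2]) -> u32 {
        let s0 = [p0[0] * TILE_SCALE, p0[1] * TILE_SCALE];
        let s1 = [p1[0] * TILE_SCALE, p1[1] * TILE_SCALE];
        span(s0[0], s1[0]) + span(s0[1], s1[1]) - 1
    }

    fn main() {
        // Diagonal crossing one tile boundary in each axis: 2 + 2 - 1 = 3.
        assert_eq!(tile_count([4.0, 4.0], [20.0, 20.0]), 3);
        // Horizontal line spanning two tile columns: 2 + 1 - 1 = 2.
        assert_eq!(tile_count([1.0, 1.0], [30.0, 1.0]), 2);
    }

Each unit counted by tile_count corresponds to one SegmentCount record, which is exactly the quantity path_tiling_setup rounds up into workgroups.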