feat: extract comments from source #161

Merged · 3 commits · Dec 19, 2024
7 changes: 5 additions & 2 deletions src/semantic/Semantic.zig
@@ -21,8 +21,9 @@ ast: Ast, // NOTE: allocated in _arena
// time. So we do it once, eat the memory overhead, and help the linter avoid
// constant re-tokenization.
// NOTE: allocated in _arena
tokens: TokenList,
tokens: TokenList.Slice,
node_links: NodeLinks,
comments: CommentList.Slice,
_gpa: Allocator,
/// Used to allocate AST nodes
_arena: ArenaAllocator,
@@ -98,7 +99,9 @@ const Type = std.builtin.Type;
const assert = std.debug.assert;

const _ast = @import("./ast.zig");
const TokenList = _ast.TokenList;
const _tokenizer = @import("./tokenizer.zig");
const TokenList = _tokenizer.TokenList;
const CommentList = _tokenizer.CommentList;
const TokenIndex = _ast.TokenIndex;

pub const NodeLinks = @import("NodeLinks.zig");
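With token end positions now retained on `Semantic` (alongside the new `comments` slice), a consumer can slice identifier text straight out of the source instead of re-tokenizing. A minimal sketch, assuming a `semantic: Semantic` and its original `source: [:0]const u8` are in hand; the names here are illustrative and not part of this diff:

// Hypothetical consumer code; `semantic` and `source` are assumed to be available.
const tags = semantic.tokens.items(.tag);
const locs = semantic.tokens.items(.loc);
for (tags, locs) |tag, loc| {
    if (tag != .identifier) continue;
    // End positions were kept, so no re-tokenization is needed.
    const text = source[loc.start..loc.end];
    _ = text;
}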
52 changes: 33 additions & 19 deletions src/semantic/SemanticBuilder.zig
@@ -92,7 +92,11 @@ pub fn withSource(self: *SemanticBuilder, source: *const _source.Source) void {
pub fn build(builder: *SemanticBuilder, source: stringSlice) SemanticError!Result {
// NOTE: ast is moved
const gpa = builder._gpa;
const tokens = try builder.tokenize(source);
const token_bundle = try tokenizer.tokenize(
builder._arena.allocator(),
source,
);

const ast = try builder.parse(source);
const node_links = try NodeLinks.init(gpa, &ast);
assert(ast.nodes.len == node_links.parents.items.len);
@@ -106,14 +110,22 @@ pub fn build(builder: *SemanticBuilder, source: stringSlice) SemanticError!Result {
try builder._node_stack.ensureTotalCapacity(gpa, @max(ast.nodes.len, 32) >> 2);

builder._semantic = Semantic{
.tokens = tokens,
.tokens = token_bundle.tokens,
.ast = ast,
.node_links = node_links,
.comments = token_bundle.comments,
._arena = builder._arena,
._gpa = gpa,
};
errdefer builder._semantic.deinit();

// TODO: collect data and approximate #symbols declared vs. #identifiers encountered
// TODO: benchmark analysis with and without this
try builder._semantic.symbols.symbols.ensureTotalCapacity(
builder._gpa,
token_bundle.stats.identifiers >> 1,
);

// Create root scope & symbol and push them onto their stacks. Also
// pushes the root node. None of these are ever popped.
try builder.enterRoot();
@@ -173,25 +185,25 @@ fn parse(self: *SemanticBuilder, source: stringSlice) Allocator.Error!Ast {
return ast;
}

fn tokenize(self: *SemanticBuilder, source: stringSlice) Allocator.Error!TokenList {
const alloc = self._arena.allocator();
// fn tokenize(self: *SemanticBuilder, source: stringSlice) Allocator.Error!TokenList {
// const alloc = self._arena.allocator();

var tokens = std.MultiArrayList(Token){};
errdefer tokens.deinit(alloc);
// var tokens = std.MultiArrayList(Token){};
// errdefer tokens.deinit(alloc);

// Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
const estimated_token_count = source.len / 8;
try tokens.ensureTotalCapacity(alloc, estimated_token_count);
// // Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
// const estimated_token_count = source.len / 8;
// try tokens.ensureTotalCapacity(alloc, estimated_token_count);

var tokenizer = std.zig.Tokenizer.init(source);
// var tokenizer = std.zig.Tokenizer.init(source);

while (true) {
const token = tokenizer.next();
try tokens.append(alloc, token);
if (token.tag == .eof) break;
}
return tokens.slice();
}
// while (true) {
// const token = tokenizer.next();
// try tokens.append(alloc, token);
// if (token.tag == .eof) break;
// }
// return tokens.slice();
// }

// =========================================================================
// ================================= VISIT =================================
@@ -1819,11 +1831,13 @@ const Type = std.builtin.Type;

const assert = std.debug.assert;

const tokenizer = @import("tokenizer.zig");
const Token = tokenizer.Token;
const TokenList = tokenizer.TokenList;

const _ast = @import("ast.zig");
const Ast = _ast.Ast;
const full = Ast.full;
const Token = _ast.Token;
const TokenList = _ast.TokenList;
const Node = _ast.Node;
const NodeIndex = _ast.NodeIndex;
const RawToken = _ast.RawToken;
3 changes: 0 additions & 3 deletions src/semantic/ast.zig
@@ -6,7 +6,6 @@ const NominalId = @import("id.zig").NominalId;

pub const Ast = std.zig.Ast;
pub const Node = Ast.Node;
pub const Token = std.zig.Token;

/// The struct used in AST tokens SOA is not pub so we hack it in here.
pub const RawToken = struct {
@@ -15,8 +14,6 @@ pub const RawToken = struct {
pub const Tag = std.zig.Token.Tag;
};

pub const TokenList = std.MultiArrayList(Token).Slice;

pub const TokenIndex = Ast.TokenIndex;
pub const NodeIndex = Node.Index;
pub const MaybeTokenId = NominalId(Ast.TokenIndex).Optional;
207 changes: 207 additions & 0 deletions src/semantic/tokenizer.zig
@@ -0,0 +1,207 @@
const std = @import("std");
const util = @import("util");
const _ast = @import("./ast.zig");
const span = @import("../span.zig");

const Allocator = std.mem.Allocator;
const Span = span.Span;

pub const Token = std.zig.Token;
pub const TokenList = std.MultiArrayList(Token);
pub const CommentList = std.MultiArrayList(Span);

pub const TokenBundle = struct {
tokens: TokenList.Slice,
/// Doc and "normal" comments found in the tokenized source.
///
/// Comments are always sorted by their position within the source. That is,
/// ```
/// \forall i \st i < comments.len-1 | comments[i] < comments[i+1]
/// ```
comments: CommentList.Slice,
stats: Stats,

const Stats = struct {
/// Number of identifier tokens encountered.
identifiers: u32 = 0,
};
};

/// Tokenize Zig source code.
///
/// Copied + modified from `Ast.parse`. We tokenize and keep our own copy of the
/// token list because Ast discards token end positions. Lacking this,
/// `ast.tokenSlice` requires re-tokenization on each call for (e.g.) identifier
/// tokens.
///
/// Zig's tokenizer also discards comments completely. We need this for (e.g.)
/// disable directives, so we need to store them ourselves.
pub fn tokenize(
// Should be an arena
allocator: Allocator,
source: [:0]const u8,
) Allocator.Error!TokenBundle {
var tokens = TokenList{};
var comments = CommentList{};
var stats: TokenBundle.Stats = .{};
errdefer {
tokens.deinit(allocator);
comments.deinit(allocator);
}

// Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
const estimated_token_count = source.len / 8;
try tokens.ensureTotalCapacity(allocator, estimated_token_count);
// TODO: collect data and find the best starting capacity
try comments.ensureTotalCapacity(allocator, 16);

var tokenizer = std.zig.Tokenizer.init(source);

var prev_end: u32 = 0;
while (true) {
@setRuntimeSafety(true);
const token = tokenizer.next();
try scanForComments(allocator, source[prev_end..token.loc.start], prev_end, &comments);
try tokens.append(allocator, token);
util.assert(token.loc.end < std.math.maxInt(u32), "token exceeds u32 limit", .{});
prev_end = @truncate(token.loc.end);
switch (token.tag) {
.identifier => stats.identifiers += 1,
.eof => break,
else => {},
}
}

return .{
.tokens = tokens.slice(),
.comments = comments.slice(),
.stats = stats,
};
}

/// Scan the gap between two tokens for comments.
/// Several comments may appear between two tokens (e.g. consecutive line
/// comments), so the entire gap is walked.
fn scanForComments(
allocator: Allocator,
// source slice between two tokens
source_between: []const u8,
offset: u32,
comments: *CommentList,
) Allocator.Error!void {
var cursor: u32 = 0;
// consecutive slashes seen
var slashes_seen: u8 = 0;
var in_comment_line = false;
var start: ?u32 = null;
while (cursor < source_between.len) : (cursor += 1) {
const c = source_between[cursor];
switch (c) {
' ', '\t' => continue,
'/' => {
// careful not to overflow if, e.g. `///////////////////` (etc.)
if (!in_comment_line) slashes_seen += 1;
if (!in_comment_line and slashes_seen >= 2) {
in_comment_line = true;
// may have more than one line comment in a row
if (start == null) start = (cursor + 1) - slashes_seen;
}
},
'\n' => {
if (in_comment_line) {
try comments.append(allocator, Span.new(start.? + offset, cursor + offset));
in_comment_line = false;
start = null;
slashes_seen = 0;
// may have more than one line comment in a row, so keep
// walking
}
},
else => continue,
}
}
// Happens when EOF is reached before a newline
if (in_comment_line and start != null) {
try comments.append(allocator, Span.new(start.? + offset, cursor + offset));
}
}

const t = std.testing;
test scanForComments {
var comments: CommentList = .{};
defer comments.deinit(t.allocator);

{
defer comments.len = 0;
const simple = "// foo";
try scanForComments(t.allocator, simple, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = simple.len });
}

// doc comments
{
defer comments.len = 0;
const source = "/// foo";
try scanForComments(t.allocator, source, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = source.len });
}
{
defer comments.len = 0;
const source = "//! foo";
try scanForComments(t.allocator, source, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = source.len });
}

// weird comments
{
defer comments.len = 0;
const multi_slash = "//////////foo//";
try scanForComments(t.allocator, multi_slash, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = multi_slash.len });
}

// multiple comments
{
defer comments.len = 0;
const source =
\\// foo
\\// bar
;
try scanForComments(t.allocator, source, 0, &comments);
try t.expectEqual(2, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = 6 });
try t.expectEqual(comments.get(1), Span{ .start = 7, .end = source.len });
try t.expectEqual(source[6], '\n');
try t.expectEqual(source[7], '/');
}
}

test "Comments are always sorted by their position within source code" {
const src =
\\//! foo
\\//! bar
\\pub fn foo() u32 { // a thing
\\ const x = 1;
\\ // another thing
\\ return x;
\\}
\\
\\// a comment
\\const x = 1;
;
var arena = std.heap.ArenaAllocator.init(t.allocator);
defer arena.deinit();
const bundle = try tokenize(arena.allocator(), src);

var prev = bundle.comments.get(0);
try t.expect(prev.start <= prev.end);

for (1..bundle.comments.len) |i| {
const curr = bundle.comments.get(i);
try t.expect(prev.end < curr.start);
prev = curr;
}
}
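For reference, a minimal sketch of driving the new `tokenize` entry point and reading a comment back out of the bundle. The test name and source string are illustrative only and assume `std` and `tokenize` are in scope, as in tokenizer.zig:

test "hypothetical: comment spans index into the original source" {
    // Illustrative usage only; mirrors the existing tests above.
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();

    const src =
        \\// keep me
        \\const y: u32 = 2;
    ;
    const bundle = try tokenize(arena.allocator(), src);

    // Comment spans are absolute offsets into the original source.
    const first = bundle.comments.get(0);
    try std.testing.expectEqualStrings("// keep me", src[first.start..first.end]);
}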
4 changes: 4 additions & 0 deletions src/span.zig
@@ -64,6 +64,10 @@ pub const Span = struct {
assert(self.end >= self.start);
return contents[self.start..self.end];
}

pub fn eql(self: Span, other: Span) bool {
return self.start == other.start and self.end == other.end;
}
};

pub const LabeledSpan = struct {