feat: extract comments from source #161

Merged · 3 commits · Dec 19, 2024
7 changes: 5 additions & 2 deletions src/semantic/Semantic.zig
@@ -21,8 +21,9 @@ ast: Ast, // NOTE: allocated in _arena
// time. So we do it once, eat the memory overhead, and help the linter avoid
// constant re-tokenization.
// NOTE: allocated in _arena
tokens: TokenList,
tokens: TokenList.Slice,
node_links: NodeLinks,
comments: CommentList.Slice,
_gpa: Allocator,
/// Used to allocate AST nodes
_arena: ArenaAllocator,
@@ -98,7 +99,9 @@ const Type = std.builtin.Type;
const assert = std.debug.assert;

const _ast = @import("./ast.zig");
const TokenList = _ast.TokenList;
const _tokenizer = @import("./tokenizer.zig");
const TokenList = _tokenizer.TokenList;
const CommentList = _tokenizer.CommentList;
const TokenIndex = _ast.TokenIndex;

pub const NodeLinks = @import("NodeLinks.zig");
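With token end positions now retained on `Semantic` (alongside the new `comments` slice), a consumer can slice identifier text straight out of the source instead of re-tokenizing. A minimal sketch, assuming a `semantic: Semantic` and its original `source: [:0]const u8` are in hand; the names here are illustrative and not part of this diff:

// Hypothetical consumer code; `semantic` and `source` are assumed to be available.
const tags = semantic.tokens.items(.tag);
const locs = semantic.tokens.items(.loc);
for (tags, locs) |tag, loc| {
    if (tag != .identifier) continue;
    // End positions were kept, so no re-tokenization is needed.
    const text = source[loc.start..loc.end];
    _ = text;
}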
52 changes: 33 additions & 19 deletions src/semantic/SemanticBuilder.zig
@@ -92,7 +92,11 @@ pub fn withSource(self: *SemanticBuilder, source: *const _source.Source) void {
pub fn build(builder: *SemanticBuilder, source: stringSlice) SemanticError!Result {
// NOTE: ast is moved
const gpa = builder._gpa;
const tokens = try builder.tokenize(source);
const token_bundle = try tokenizer.tokenize(
builder._arena.allocator(),
source,
);

const ast = try builder.parse(source);
const node_links = try NodeLinks.init(gpa, &ast);
assert(ast.nodes.len == node_links.parents.items.len);
@@ -106,14 +110,22 @@ pub fn build(builder: *SemanticBuilder, source: stringSlice) SemanticError!Result {
try builder._node_stack.ensureTotalCapacity(gpa, @max(ast.nodes.len, 32) >> 2);

builder._semantic = Semantic{
.tokens = tokens,
.tokens = token_bundle.tokens,
.ast = ast,
.node_links = node_links,
.comments = token_bundle.comments,
._arena = builder._arena,
._gpa = gpa,
};
errdefer builder._semantic.deinit();

// TODO: collect data and approximate #symbols declared vs. #identifiers encountered
// TODO: benchmark analysis with and without this
try builder._semantic.symbols.symbols.ensureTotalCapacity(
builder._gpa,
token_bundle.stats.identifiers >> 1,
);

// Create root scope & symbol and push them onto their stacks. Also
// pushes the root node. None of these are ever popped.
try builder.enterRoot();
@@ -173,25 +185,25 @@ fn parse(self: *SemanticBuilder, source: stringSlice) Allocator.Error!Ast {
return ast;
}

fn tokenize(self: *SemanticBuilder, source: stringSlice) Allocator.Error!TokenList {
const alloc = self._arena.allocator();
// fn tokenize(self: *SemanticBuilder, source: stringSlice) Allocator.Error!TokenList {
// const alloc = self._arena.allocator();

var tokens = std.MultiArrayList(Token){};
errdefer tokens.deinit(alloc);
// var tokens = std.MultiArrayList(Token){};
// errdefer tokens.deinit(alloc);

// Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
const estimated_token_count = source.len / 8;
try tokens.ensureTotalCapacity(alloc, estimated_token_count);
// // Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
// const estimated_token_count = source.len / 8;
// try tokens.ensureTotalCapacity(alloc, estimated_token_count);

var tokenizer = std.zig.Tokenizer.init(source);
// var tokenizer = std.zig.Tokenizer.init(source);

while (true) {
const token = tokenizer.next();
try tokens.append(alloc, token);
if (token.tag == .eof) break;
}
return tokens.slice();
}
// while (true) {
// const token = tokenizer.next();
// try tokens.append(alloc, token);
// if (token.tag == .eof) break;
// }
// return tokens.slice();
// }

// =========================================================================
// ================================= VISIT =================================
@@ -1819,11 +1831,13 @@ const Type = std.builtin.Type;

const assert = std.debug.assert;

const tokenizer = @import("tokenizer.zig");
const Token = tokenizer.Token;
const TokenList = tokenizer.TokenList;

const _ast = @import("ast.zig");
const Ast = _ast.Ast;
const full = Ast.full;
const Token = _ast.Token;
const TokenList = _ast.TokenList;
const Node = _ast.Node;
const NodeIndex = _ast.NodeIndex;
const RawToken = _ast.RawToken;
3 changes: 0 additions & 3 deletions src/semantic/ast.zig
@@ -6,7 +6,6 @@ const NominalId = @import("id.zig").NominalId;

pub const Ast = std.zig.Ast;
pub const Node = Ast.Node;
pub const Token = std.zig.Token;

/// The struct used in AST tokens SOA is not pub so we hack it in here.
pub const RawToken = struct {
@@ -15,8 +14,6 @@ pub const RawToken = struct {
pub const Tag = std.zig.Token.Tag;
};

pub const TokenList = std.MultiArrayList(Token).Slice;

pub const TokenIndex = Ast.TokenIndex;
pub const NodeIndex = Node.Index;
pub const MaybeTokenId = NominalId(Ast.TokenIndex).Optional;
207 changes: 207 additions & 0 deletions src/semantic/tokenizer.zig
@@ -0,0 +1,207 @@
const std = @import("std");
const util = @import("util");
const _ast = @import("./ast.zig");
const span = @import("../span.zig");

const Allocator = std.mem.Allocator;
const Span = span.Span;

pub const Token = std.zig.Token;
pub const TokenList = std.MultiArrayList(Token);
pub const CommentList = std.MultiArrayList(Span);

pub const TokenBundle = struct {
tokens: TokenList.Slice,
/// Doc and "normal" comments found in the tokenized source.
///
/// Comments are always sorted by their position within the source. That is,
/// ```
/// \forall i \st i < comments.len-1 | comments[i] < comments[i+1]
/// ```
comments: CommentList.Slice,
stats: Stats,

const Stats = struct {
/// Number of identifier tokens encountered.
identifiers: u32 = 0,
};
};

/// Tokenize Zig source code.
///
/// Copied + modified from `Ast.parse`. We tokenize and keep our own copy of the
/// token list because Ast discards token end positions. Lacking this,
/// `ast.tokenSlice` requires re-tokenization on each call for (e.g.) identifier
/// tokens.
///
/// Zig's tokenizer also discards comments completely. We need this for (e.g.)
/// disable directives, so we need to store them ourselves.
pub fn tokenize(
// Should be an arena
allocator: Allocator,
source: [:0]const u8,
) Allocator.Error!TokenBundle {
var tokens = TokenList{};
var comments = CommentList{};
var stats: TokenBundle.Stats = .{};
errdefer {
tokens.deinit(allocator);
comments.deinit(allocator);
}

// Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
const estimated_token_count = source.len / 8;
try tokens.ensureTotalCapacity(allocator, estimated_token_count);
// TODO: collect data and find the best starting capacity
try comments.ensureTotalCapacity(allocator, 16);

var tokenizer = std.zig.Tokenizer.init(source);

var prev_end: u32 = 0;
while (true) {
@setRuntimeSafety(true);
const token = tokenizer.next();
try scanForComments(allocator, source[prev_end..token.loc.start], prev_end, &comments);
try tokens.append(allocator, token);
util.assert(token.loc.end < std.math.maxInt(u32), "token exceeds u32 limit", .{});
prev_end = @truncate(token.loc.end);
switch (token.tag) {
.identifier => stats.identifiers += 1,
.eof => break,
else => {},
}
}

return .{
.tokens = tokens.slice(),
.comments = comments.slice(),
.stats = stats,
};
}

/// Scan the gap between two tokens for comments.
/// Several comments may appear between two tokens (e.g. consecutive line
/// comments), so the entire gap is walked.
fn scanForComments(
allocator: Allocator,
// source slice between two tokens
source_between: []const u8,
offset: u32,
comments: *CommentList,
) Allocator.Error!void {
var cursor: u32 = 0;
// consecutive slashes seen
var slashes_seen: u8 = 0;
var in_comment_line = false;
var start: ?u32 = null;
while (cursor < source_between.len) : (cursor += 1) {
const c = source_between[cursor];
switch (c) {
' ', '\t' => continue,
'/' => {
// careful not to overflow if, e.g. `///////////////////` (etc.)
if (!in_comment_line) slashes_seen += 1;
if (!in_comment_line and slashes_seen >= 2) {
in_comment_line = true;
// may have more than one line comment in a row
if (start == null) start = (cursor + 1) - slashes_seen;
}
},
'\n' => {
if (in_comment_line) {
try comments.append(allocator, Span.new(start.? + offset, cursor + offset));
in_comment_line = false;
start = null;
slashes_seen = 0;
// may have more than one line comment in a row, so keep
// walking
}
},
else => continue,
}
}
// Happens when EOF is reached before a newline
if (in_comment_line and start != null) {
try comments.append(allocator, Span.new(start.? + offset, cursor + offset));
}
}

const t = std.testing;
test scanForComments {
var comments: CommentList = .{};
defer comments.deinit(t.allocator);

{
defer comments.len = 0;
const simple = "// foo";
try scanForComments(t.allocator, simple, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = simple.len });
}

// doc comments
{
defer comments.len = 0;
const source = "/// foo";
try scanForComments(t.allocator, source, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = source.len });
}
{
defer comments.len = 0;
const source = "//! foo";
try scanForComments(t.allocator, source, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = source.len });
}

// weird comments
{
defer comments.len = 0;
const multi_slash = "//////////foo//";
try scanForComments(t.allocator, multi_slash, 0, &comments);
try t.expectEqual(1, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = multi_slash.len });
}

// multiple comments
{
defer comments.len = 0;
const source =
\\// foo
\\// bar
;
try scanForComments(t.allocator, source, 0, &comments);
try t.expectEqual(2, comments.len);
try t.expectEqual(comments.get(0), Span{ .start = 0, .end = 6 });
try t.expectEqual(comments.get(1), Span{ .start = 7, .end = source.len });
try t.expectEqual(source[6], '\n');
try t.expectEqual(source[7], '/');
}
}

test "Comments are always sorted by their position within source code" {
const src =
\\//! foo
\\//! bar
\\pub fn foo() u32 { // a thing
\\ const x = 1;
\\ // another thing
\\ return x;
\\}
\\
\\// a comment
\\const x = 1;
;
var arena = std.heap.ArenaAllocator.init(t.allocator);
defer arena.deinit();
const bundle = try tokenize(arena.allocator(), src);

var prev = bundle.comments.get(0);
try t.expect(prev.start <= prev.end);

for (1..bundle.comments.len) |i| {
const curr = bundle.comments.get(i);
try t.expect(prev.end < curr.start);
prev = curr;
}
}
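For reference, a minimal sketch of driving the new `tokenize` entry point and reading a comment back out of the bundle. The test name and source string are illustrative only and assume `std` and `tokenize` are in scope, as in tokenizer.zig:

test "hypothetical: comment spans index into the original source" {
    // Illustrative usage only; mirrors the existing tests above.
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();

    const src =
        \\// keep me
        \\const y: u32 = 2;
    ;
    const bundle = try tokenize(arena.allocator(), src);

    // Comment spans are absolute offsets into the original source.
    const first = bundle.comments.get(0);
    try std.testing.expectEqualStrings("// keep me", src[first.start..first.end]);
}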
4 changes: 4 additions & 0 deletions src/span.zig
@@ -64,6 +64,10 @@ pub const Span = struct {
assert(self.end >= self.start);
return contents[self.start..self.end];
}

pub fn eql(self: Span, other: Span) bool {
return self.start == other.start and self.end == other.end;
}
};

pub const LabeledSpan = struct {