Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize the outer lexer loop. #3140

Merged
merged 7 commits into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions toolchain/lexer/token_kind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ auto TokenKind::opening_symbol() const -> TokenKind {
return result;
}

// Returns whether this token kind is a symbol spelled with exactly one
// character that never combines with other characters to form a longer
// symbol, so the lexer can emit it without max-munch lookahead.
//
// The lookup table is generated from `token_kind.def`: every token kind
// defaults to `false` via CARBON_TOKEN, and the entries declared with
// CARBON_ONE_CHAR_SYMBOL_TOKEN override to `true`. The table is indexed by
// the enum's integer value, so it has one entry per token kind in
// declaration order.
auto TokenKind::is_one_char_symbol() const -> bool {
static constexpr bool Table[] = {
#define CARBON_TOKEN(TokenName) false,
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) true,
#include "toolchain/lexer/token_kind.def"
};
return Table[AsInt()];
}

auto TokenKind::is_keyword() const -> bool {
static constexpr bool Table[] = {
#define CARBON_TOKEN(TokenName) false,
Expand Down
22 changes: 18 additions & 4 deletions toolchain/lexer/token_kind.def
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@
#define CARBON_SYMBOL_TOKEN(Name, Spelling) CARBON_TOKEN(Name)
#endif

#ifndef CARBON_ONE_CHAR_SYMBOL_TOKEN
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling) \
CARBON_SYMBOL_TOKEN(Name, Spelling)
#endif

#ifndef CARBON_TOKEN_WITH_VIRTUAL_NODE
#define CARBON_TOKEN_WITH_VIRTUAL_NODE(Name) Name
#endif
Expand Down Expand Up @@ -78,7 +83,6 @@ CARBON_SYMBOL_TOKEN(At, "@")
CARBON_SYMBOL_TOKEN(Backslash, "\\")
CARBON_SYMBOL_TOKEN(Caret, "^")
CARBON_SYMBOL_TOKEN(Colon, ":")
CARBON_SYMBOL_TOKEN(Comma, ",")
CARBON_SYMBOL_TOKEN(Equal, "=")
CARBON_SYMBOL_TOKEN(Exclaim, "!")
CARBON_SYMBOL_TOKEN(Greater, ">")
Expand All @@ -89,16 +93,25 @@ CARBON_SYMBOL_TOKEN(Period, ".")
CARBON_SYMBOL_TOKEN(Pipe, "|")
CARBON_SYMBOL_TOKEN(Plus, "+")
CARBON_SYMBOL_TOKEN(Question, "?")
CARBON_SYMBOL_TOKEN(Semi, ";")
CARBON_SYMBOL_TOKEN(Slash, "/")
CARBON_SYMBOL_TOKEN(Star, "*")
CARBON_SYMBOL_TOKEN(Tilde, "~")

// Some Carbon symbols are constructively exactly one character and cannot be
// combined with any other characters to form new symbols. We can lex these
// without needing to max-munch any other characters. These are typically
// expected to be terminators or separators that need to compose with all other
// parts of the grammar. Group symbols are also currently one-character symbols,
// although we may choose to remove that if we need to add composite grouping
// symbols in the future.
CARBON_ONE_CHAR_SYMBOL_TOKEN(Comma, ",")
CARBON_ONE_CHAR_SYMBOL_TOKEN(Semi, ";")

// clang-format on

#ifndef CARBON_OPENING_GROUP_SYMBOL_TOKEN
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(Name, Spelling, ClosingName) \
CARBON_SYMBOL_TOKEN(Name, Spelling)
CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling)
#endif
CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenParen, "(", CloseParen)
CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenCurlyBrace, "{", CloseCurlyBrace)
Expand All @@ -107,13 +120,14 @@ CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenSquareBracket, "[", CloseSquareBracket)

#ifndef CARBON_CLOSING_GROUP_SYMBOL_TOKEN
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(Name, Spelling, OpeningName) \
CARBON_SYMBOL_TOKEN(Name, Spelling)
CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling)
#endif
CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseParen, ")", OpenParen)
CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseCurlyBrace, "}", OpenCurlyBrace)
CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseSquareBracket, "]", OpenSquareBracket)
#undef CARBON_CLOSING_GROUP_SYMBOL_TOKEN

#undef CARBON_ONE_CHAR_SYMBOL_TOKEN
#undef CARBON_SYMBOL_TOKEN

#ifndef CARBON_KEYWORD_TOKEN
Expand Down
4 changes: 4 additions & 0 deletions toolchain/lexer/token_kind.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class TokenKind : public CARBON_ENUM_BASE(TokenKind) {
// The token kind must be a closing symbol.
[[nodiscard]] auto opening_symbol() const -> TokenKind;

// Test whether this kind of token is a one-character symbol whose character
// is not part of any other symbol.
[[nodiscard]] auto is_one_char_symbol() const -> bool;

// Test whether this kind of token is a keyword.
[[nodiscard]] auto is_keyword() const -> bool;

Expand Down
159 changes: 134 additions & 25 deletions toolchain/lexer/tokenized_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common/check.h"
#include "common/string_helpers.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
Expand Down Expand Up @@ -247,6 +248,10 @@ class TokenizedBuffer::Lexer {
bool formed_token_;
};

using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef& source_text)
-> LexResult;
using DispatchTableT = std::array<DispatchFunctionT*, 256>;

Lexer(TokenizedBuffer& buffer, DiagnosticConsumer& consumer)
: buffer_(&buffer),
translator_(&buffer),
Expand Down Expand Up @@ -351,7 +356,7 @@ class TokenizedBuffer::Lexer {
std::optional<LexedNumericLiteral> literal =
LexedNumericLiteral::Lex(source_text);
if (!literal) {
return LexResult::NoMatch();
return LexError(source_text);
}

int int_column = current_column_;
Expand Down Expand Up @@ -402,7 +407,7 @@ class TokenizedBuffer::Lexer {
std::optional<LexedStringLiteral> literal =
LexedStringLiteral::Lex(source_text);
if (!literal) {
return LexResult::NoMatch();
return LexError(source_text);
}

Line string_line = current_line_;
Expand Down Expand Up @@ -453,14 +458,30 @@ class TokenizedBuffer::Lexer {
}
}

auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
auto LexSymbolToken(llvm::StringRef& source_text,
TokenKind kind = TokenKind::Error) -> LexResult {
auto compute_symbol_kind = [](llvm::StringRef source_text) {
return llvm::StringSwitch<TokenKind>(source_text)
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
.StartsWith(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
.Default(TokenKind::Error);
if (kind == TokenKind::Error) {
return LexResult::NoMatch();
.Default(TokenKind::Error);
};

// We use the `error` token as a place-holder for cases where one character
// isn't enough to pick a definitive symbol token. Recompute the kind using
// the full symbol set.
if (LLVM_UNLIKELY(kind == TokenKind::Error)) {
kind = compute_symbol_kind(source_text);
if (kind == TokenKind::Error) {
return LexError(source_text);
}
} else {
// Verify in a debug build that the incoming token kind is correct.
CARBON_DCHECK(kind == compute_symbol_kind(source_text))
<< "Incoming token kind '" << kind
<< "' does not match computed kind '"
<< compute_symbol_kind(source_text) << "'!";
}

if (!set_indent_) {
Expand Down Expand Up @@ -609,9 +630,11 @@ class TokenizedBuffer::Lexer {
}

auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
if (!IsAlpha(source_text.front()) && source_text.front() != '_') {
return LexResult::NoMatch();
if (static_cast<unsigned char>(source_text.front()) > 0x7F) {
// TODO: Need to add support for Unicode lexing.
return LexError(source_text);
}
CARBON_CHECK(IsAlpha(source_text.front()) || source_text.front() == '_');

if (!set_indent_) {
current_line_info_->indent = current_column_;
Expand Down Expand Up @@ -692,6 +715,76 @@ class TokenizedBuffer::Lexer {
.column = current_column_});
}

// Builds the 256-entry dispatch table mapping the first byte of the
// remaining source text to the lexing routine that should handle it (see
// `DispatchTableT`). Declared `constexpr` so the table can be fully
// computed at compile time. Entries are filled from most generic to most
// specific, with later assignments deliberately overwriting earlier ones.
constexpr static auto MakeDispatchTable() -> DispatchTableT {
DispatchTableT table = {};
// Start by dispatching every byte to the error path; specific byte values
// are overridden below, so anything left over lexes as an error.
auto dispatch_lex_error = +[](Lexer& lexer, llvm::StringRef& source_text) {
return lexer.LexError(source_text);
};
for (int i = 0; i < 256; ++i) {
table[i] = dispatch_lex_error;
}

// Symbols have some special dispatching. First, set the first character of
// each symbol token spelling to dispatch to the symbol lexer. We don't
// provide a pre-computed token here, so the symbol lexer will compute the
// exact symbol token kind (multiple symbols can share a first byte).
auto dispatch_lex_symbol = +[](Lexer& lexer, llvm::StringRef& source_text) {
return lexer.LexSymbolToken(source_text);
};
#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
table[(Spelling)[0]] = dispatch_lex_symbol;
#include "toolchain/lexer/token_kind.def"

// Now special-case single-character symbols that are guaranteed not to
// join with another symbol. These are grouping symbols, terminators,
// or separators in the grammar and have a good reason to be
// orthogonal to any other punctuation. We do this separately because this
// needs to override some of the generic symbol handling above, and provide
// a custom, pre-computed token kind so the symbol lexer can skip the
// max-munch computation.
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
table[(Spelling)[0]] = +[](Lexer& lexer, llvm::StringRef& source_text) { \
return lexer.LexSymbolToken(source_text, TokenKind::TokenName); \
};
#include "toolchain/lexer/token_kind.def"

// Identifier and keyword bytes: underscore plus ASCII letters.
auto dispatch_lex_word = +[](Lexer& lexer, llvm::StringRef& source_text) {
return lexer.LexKeywordOrIdentifier(source_text);
};
table['_'] = dispatch_lex_word;
// Note that we don't use `llvm::seq` because this needs to be `constexpr`
// evaluated.
for (unsigned char c = 'a'; c <= 'z'; ++c) {
table[c] = dispatch_lex_word;
}
for (unsigned char c = 'A'; c <= 'Z'; ++c) {
table[c] = dispatch_lex_word;
}
// We dispatch all non-ASCII UTF-8 characters to the identifier lexing
// as whitespace characters should already have been skipped and the
// only remaining valid Unicode characters would be part of an
// identifier. That code can either accept or reject.
for (int i = 0x80; i < 0x100; ++i) {
table[i] = dispatch_lex_word;
}

// Decimal digits start numeric literals.
auto dispatch_lex_numeric =
+[](Lexer& lexer, llvm::StringRef& source_text) {
return lexer.LexNumericLiteral(source_text);
};
for (unsigned char c = '0'; c <= '9'; ++c) {
table[c] = dispatch_lex_numeric;
}

// String literals start with a quote or `#` (the latter presumably for
// raw/extended string syntax handled by the string lexer -- see
// LexedStringLiteral::Lex for the authoritative grammar).
auto dispatch_lex_string = +[](Lexer& lexer, llvm::StringRef& source_text) {
return lexer.LexStringLiteral(source_text);
};
table['\''] = dispatch_lex_string;
table['"'] = dispatch_lex_string;
table['#'] = dispatch_lex_string;

return table;
};

private:
TokenizedBuffer* buffer_;

Expand All @@ -716,24 +809,40 @@ auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
ErrorTrackingDiagnosticConsumer error_tracking_consumer(consumer);
Lexer lexer(buffer, error_tracking_consumer);

// Build a table of function pointers that we can use to dispatch to the
// correct lexer routine based on the first byte of source text.
zygoloid marked this conversation as resolved.
Show resolved Hide resolved
//
// While it is tempting to simply use a `switch` on the first byte and
// dispatch with cases into this, in practice that doesn't produce great code.
// There seem to be two issues that are the root cause.
//
// First, there are lots of different values of bytes that dispatch to a
// fairly small set of routines, and then some byte values that dispatch
// differently for each byte. This pattern isn't one that the compiler-based
// lowering of switches works well with -- it tries to balance all the cases,
// and in doing so emits several compares and other control flow rather than a
// simple jump table.
//
// Second, with a `case`, it isn't as obvious how to create a single, uniform
// interface that is effective for *every* byte value, and thus makes for a
// single consistent table-based dispatch. By forcing these to be function
// pointers, we also coerce the code to use a strictly homogeneous structure
// that can form a single dispatch table.
//
// These two actually interact -- the second issue is part of what makes the
// non-table lowering in the first one desirable for many switches and cases.
//
// Ultimately, when table-based dispatch is such an important technique, we
// get better results by taking full control and manually creating the
// dispatch structures.
constexpr Lexer::DispatchTableT DispatchTable = Lexer::MakeDispatchTable();

llvm::StringRef source_text = source.text();
while (lexer.SkipWhitespace(source_text)) {
zygoloid marked this conversation as resolved.
Show resolved Hide resolved
// Each time we find non-whitespace characters, try each kind of token we
// support lexing, from simplest to most complex.
Lexer::LexResult result = lexer.LexSymbolToken(source_text);
if (!result) {
result = lexer.LexKeywordOrIdentifier(source_text);
}
if (!result) {
result = lexer.LexNumericLiteral(source_text);
}
if (!result) {
result = lexer.LexStringLiteral(source_text);
}
if (!result) {
result = lexer.LexError(source_text);
}
CARBON_CHECK(result) << "No token was lexed.";
Lexer::LexResult result =
DispatchTable[static_cast<unsigned char>(source_text.front())](
lexer, source_text);
CARBON_CHECK(result) << "Failed to form a token!";
}

// The end-of-file token is always considered to be whitespace.
Expand Down
Loading