Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize the outer lexer loop. #3140

Merged
merged 7 commits on Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions toolchain/lexer/token_kind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ auto TokenKind::opening_symbol() const -> TokenKind {
return result;
}

auto TokenKind::is_one_char_symbol() const -> bool {
  // Lookup table generated from token_kind.def: an entry is true exactly for
  // the kinds declared via CARBON_ONE_CHAR_SYMBOL_TOKEN, and false for every
  // other token kind.
  static constexpr bool IsOneCharSymbol[] = {
#define CARBON_TOKEN(TokenName) false,
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) true,
#include "toolchain/lexer/token_kind.def"
  };
  return IsOneCharSymbol[AsInt()];
}

auto TokenKind::is_keyword() const -> bool {
static constexpr bool Table[] = {
#define CARBON_TOKEN(TokenName) false,
Expand Down
22 changes: 18 additions & 4 deletions toolchain/lexer/token_kind.def
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@
#define CARBON_SYMBOL_TOKEN(Name, Spelling) CARBON_TOKEN(Name)
#endif

// Fallback for x-macro consumers that don't distinguish one-character
// symbols: treat them as ordinary symbol tokens.
#ifndef CARBON_ONE_CHAR_SYMBOL_TOKEN
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling) \
CARBON_SYMBOL_TOKEN(Name, Spelling)
#endif

#ifndef CARBON_TOKEN_WITH_VIRTUAL_NODE
#define CARBON_TOKEN_WITH_VIRTUAL_NODE(Name) Name
#endif
Expand Down Expand Up @@ -78,7 +83,6 @@ CARBON_SYMBOL_TOKEN(At, "@")
CARBON_SYMBOL_TOKEN(Backslash, "\\")
CARBON_SYMBOL_TOKEN(Caret, "^")
CARBON_SYMBOL_TOKEN(Colon, ":")
CARBON_SYMBOL_TOKEN(Comma, ",")
CARBON_SYMBOL_TOKEN(Equal, "=")
CARBON_SYMBOL_TOKEN(Exclaim, "!")
CARBON_SYMBOL_TOKEN(Greater, ">")
Expand All @@ -89,16 +93,25 @@ CARBON_SYMBOL_TOKEN(Period, ".")
CARBON_SYMBOL_TOKEN(Pipe, "|")
CARBON_SYMBOL_TOKEN(Plus, "+")
CARBON_SYMBOL_TOKEN(Question, "?")
CARBON_SYMBOL_TOKEN(Semi, ";")
CARBON_SYMBOL_TOKEN(Slash, "/")
CARBON_SYMBOL_TOKEN(Star, "*")
CARBON_SYMBOL_TOKEN(Tilde, "~")

// Some Carbon symbols are constructively exactly one character and cannot be
// combined with any other characters to form new symbols. We can lex these
// without needing to max-munch any other characters. These are typically
// expected to be terminators or separators that need to compose with all other
// parts of the grammar. Group symbols are also currently one-character symbols,
// although we may choose to remove that if we need to add composite grouping
// symbols in the future.
CARBON_ONE_CHAR_SYMBOL_TOKEN(Comma, ",")
CARBON_ONE_CHAR_SYMBOL_TOKEN(Semi, ";")

// clang-format on

#ifndef CARBON_OPENING_GROUP_SYMBOL_TOKEN
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(Name, Spelling, ClosingName) \
CARBON_SYMBOL_TOKEN(Name, Spelling)
CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling)
#endif
CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenParen, "(", CloseParen)
CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenCurlyBrace, "{", CloseCurlyBrace)
Expand All @@ -107,13 +120,14 @@ CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenSquareBracket, "[", CloseSquareBracket)

#ifndef CARBON_CLOSING_GROUP_SYMBOL_TOKEN
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(Name, Spelling, OpeningName) \
CARBON_SYMBOL_TOKEN(Name, Spelling)
CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling)
#endif
CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseParen, ")", OpenParen)
CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseCurlyBrace, "}", OpenCurlyBrace)
CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseSquareBracket, "]", OpenSquareBracket)
#undef CARBON_CLOSING_GROUP_SYMBOL_TOKEN

#undef CARBON_ONE_CHAR_SYMBOL_TOKEN
#undef CARBON_SYMBOL_TOKEN

#ifndef CARBON_KEYWORD_TOKEN
Expand Down
4 changes: 4 additions & 0 deletions toolchain/lexer/token_kind.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class TokenKind : public CARBON_ENUM_BASE(TokenKind) {
// The token kind must be a closing symbol.
[[nodiscard]] auto opening_symbol() const -> TokenKind;

// Test whether this kind of token is a one-character symbol whose character
// is not part of any other symbol.
[[nodiscard]] auto is_one_char_symbol() const -> bool;

// Test whether this kind of token is a keyword.
[[nodiscard]] auto is_keyword() const -> bool;

Expand Down
156 changes: 131 additions & 25 deletions toolchain/lexer/tokenized_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common/check.h"
#include "common/string_helpers.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
Expand Down Expand Up @@ -247,6 +248,10 @@ class TokenizedBuffer::Lexer {
bool formed_token_;
};

using DispatchFunctionT = LexResult(Lexer& lexer,
llvm::StringRef& source_text);
chandlerc marked this conversation as resolved.
Show resolved Hide resolved
using DispatchTableT = std::array<DispatchFunctionT*, 256>;

Lexer(TokenizedBuffer& buffer, DiagnosticConsumer& consumer)
: buffer_(&buffer),
translator_(&buffer),
Expand Down Expand Up @@ -351,7 +356,7 @@ class TokenizedBuffer::Lexer {
std::optional<LexedNumericLiteral> literal =
LexedNumericLiteral::Lex(source_text);
if (!literal) {
return LexResult::NoMatch();
return LexError(source_text);
}

int int_column = current_column_;
Expand Down Expand Up @@ -402,7 +407,7 @@ class TokenizedBuffer::Lexer {
std::optional<LexedStringLiteral> literal =
LexedStringLiteral::Lex(source_text);
if (!literal) {
return LexResult::NoMatch();
return LexError(source_text);
}

Line string_line = current_line_;
Expand Down Expand Up @@ -453,16 +458,47 @@ class TokenizedBuffer::Lexer {
}
}

auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
auto LexSymbolToken(llvm::StringRef& source_text, TokenKind kind)
-> LexResult {
// We use the `period` token as a place-holder for cases where one
// character isn't enough to pick a definitive symbol token. Recompute the
// kind using the full symbol set.
zygoloid marked this conversation as resolved.
Show resolved Hide resolved
if (LLVM_UNLIKELY(kind == TokenKind::Period)) {
kind = llvm::StringSwitch<TokenKind>(source_text)
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
.StartsWith(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
.Default(TokenKind::Error);
if (kind == TokenKind::Error) {
return LexResult::NoMatch();
.Default(TokenKind::Error);
if (kind == TokenKind::Error) {
return LexError(source_text);
}
} else {
#ifndef NDEBUG
// Verify in a debug build that the incoming token kind is correct.
TokenKind debug_kind = llvm::StringSwitch<TokenKind>(source_text)
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
.StartsWith(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
.Default(TokenKind::Error);
CARBON_CHECK(kind == debug_kind)
<< "Incoming token kind '" << kind
<< "' does not match computed kind '" << debug_kind << "'!";
#endif
chandlerc marked this conversation as resolved.
Show resolved Hide resolved
}

// In debug builds, re-check the kind after our optimizations to make sure
// we get the same result.
#ifndef NDEBUG
TokenKind debug_kind = llvm::StringSwitch<TokenKind>(source_text)
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
.StartsWith(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
.Default(TokenKind::Error);
CARBON_CHECK(debug_kind == kind)
<< "Optimized code computed kind '" << kind
<< "' but it should have been '" << debug_kind << "'";
#endif
chandlerc marked this conversation as resolved.
Show resolved Hide resolved

if (!set_indent_) {
current_line_info_->indent = current_column_;
set_indent_ = true;
Expand Down Expand Up @@ -609,9 +645,11 @@ class TokenizedBuffer::Lexer {
}

auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
if (!IsAlpha(source_text.front()) && source_text.front() != '_') {
return LexResult::NoMatch();
if (static_cast<unsigned char>(source_text.front()) > 0x7F) {
// TODO: Need to add support for Unicode lexing.
return LexError(source_text);
}
CARBON_CHECK(IsAlpha(source_text.front()) || source_text.front() == '_');

if (!set_indent_) {
current_line_info_->indent = current_column_;
Expand Down Expand Up @@ -692,6 +730,82 @@ class TokenizedBuffer::Lexer {
.column = current_column_});
}

// Builds the 256-entry table mapping the first byte of the remaining source
// text to the lexing routine that handles it. Computed entirely at compile
// time so the lexer's outer loop is a single indexed indirect call per token.
constexpr static auto MakeDispatchTable() -> DispatchTableT {
  DispatchTableT table = {};
  // Start with every byte dispatching to the error path; the cases below
  // overwrite each byte that can begin a valid token.
  auto dispatch_lex_error = +[](Lexer& lexer, llvm::StringRef& source_text) {
    return lexer.LexError(source_text);
  };
  for (int i = 0; i < 256; ++i) {
    table[i] = dispatch_lex_error;
  }

  // First, set the first character of each symbol token spelling to dispatch
  // to the symbol lexer. We use a `Period` placeholder for the token as there
  // may be several different tokens that start with the same spelling. When
  // that placeholder token kind is used, the symbol lexer will compute the
  // exact symbol token kind.
  auto dispatch_lex_symbol = +[](Lexer& lexer, llvm::StringRef& source_text) {
    return lexer.LexSymbolToken(source_text, TokenKind::Period);
  };
#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = dispatch_lex_symbol;
#include "toolchain/lexer/token_kind.def"

  // Now special-case single-character symbols that are guaranteed to not
  // join with another symbol. These are grouping symbols, terminators,
  // or separators in the grammar and have a good reason to be
  // orthogonal to any other punctuation. We do this separately because this
  // needs to override some of the generic handling above.
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)                  \
  table[(Spelling)[0]] = +[](Lexer& lexer, llvm::StringRef& source_text) { \
    return lexer.LexSymbolToken(source_text, TokenKind::TokenName);        \
  };
#include "toolchain/lexer/token_kind.def"

  // Identifiers and keywords dispatch directly to the word lexer, which
  // distinguishes the two itself.
  auto dispatch_lex_word = +[](Lexer& lexer, llvm::StringRef& source_text) {
    return lexer.LexKeywordOrIdentifier(source_text);
  };
  table['_'] = dispatch_lex_word;
  // Note that we use raw loops because this needs to be `constexpr`
  // evaluated.
  for (unsigned char c = 'a'; c <= 'z'; ++c) {
    table[c] = dispatch_lex_word;
  }
  for (unsigned char c = 'A'; c <= 'Z'; ++c) {
    table[c] = dispatch_lex_word;
  }
  // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
  // as whitespace characters should already have been skipped and the
  // only remaining valid Unicode characters would be part of an
  // identifier. That code can either accept or reject.
  for (int i = 0x80; i < 0x100; ++i) {
    table[i] = dispatch_lex_word;
  }

  // Digits dispatch directly to the numeric literal lexer, which handles
  // every numeric literal form.
  auto dispatch_lex_numeric =
      +[](Lexer& lexer, llvm::StringRef& source_text) {
        return lexer.LexNumericLiteral(source_text);
      };
  for (unsigned char c = '0'; c <= '9'; ++c) {
    table[c] = dispatch_lex_numeric;
  }

  // We can immediately tell when starting to lex a string literal and
  // dispatch directly.
  auto dispatch_lex_string = +[](Lexer& lexer, llvm::StringRef& source_text) {
    return lexer.LexStringLiteral(source_text);
  };
  table['\''] = dispatch_lex_string;
  table['"'] = dispatch_lex_string;
  table['#'] = dispatch_lex_string;

  return table;
}

private:
TokenizedBuffer* buffer_;

Expand All @@ -716,24 +830,16 @@ auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
ErrorTrackingDiagnosticConsumer error_tracking_consumer(consumer);
Lexer lexer(buffer, error_tracking_consumer);

// Build a table of function pointers that we can use to dispatch to the
// correct lexer routine based on the first byte of source text.
zygoloid marked this conversation as resolved.
Show resolved Hide resolved
constexpr Lexer::DispatchTableT DispatchTable = Lexer::MakeDispatchTable();

llvm::StringRef source_text = source.text();
while (lexer.SkipWhitespace(source_text)) {
zygoloid marked this conversation as resolved.
Show resolved Hide resolved
// Each time we find non-whitespace characters, try each kind of token we
// support lexing, from simplest to most complex.
Lexer::LexResult result = lexer.LexSymbolToken(source_text);
if (!result) {
result = lexer.LexKeywordOrIdentifier(source_text);
}
if (!result) {
result = lexer.LexNumericLiteral(source_text);
}
if (!result) {
result = lexer.LexStringLiteral(source_text);
}
if (!result) {
result = lexer.LexError(source_text);
}
CARBON_CHECK(result) << "No token was lexed.";
Lexer::LexResult result =
DispatchTable[static_cast<unsigned char>(source_text.front())](
lexer, source_text);
CARBON_CHECK(result) << "Failed to form a token!";
}

// The end-of-file token is always considered to be whitespace.
Expand Down
Loading
Loading