-
Notifications
You must be signed in to change notification settings - Fork 225
Faster lexical analyzer #2665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Faster lexical analyzer #2665
Changes from all commits
b1e9cb5
0e910dc
e7bcf64
3872428
70b3c11
52d1de6
cd148f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| require "rbs" | ||
| require "benchmark/ips" | ||
| require "csv" | ||
| require "pathname" | ||
|
|
||
| files = {} | ||
| ARGV.each do |file| | ||
| content = File.read(file) | ||
| files[file] = RBS::Buffer.new(content: content, name: Pathname(file)) | ||
| end | ||
|
|
||
| puts "Benchmarking parsing #{files.size} files..." | ||
|
|
||
| result = Benchmark.ips do |x| | ||
| x.report("parsing") do | ||
| files.each do |file, content| | ||
| RBS::Parser.parse_signature(content) | ||
| end | ||
| end | ||
|
|
||
| x.quiet = true | ||
| end | ||
|
|
||
| entry = result.entries[0] | ||
| puts "✅ #{"%0.3f" % entry.ips} i/s (±#{"%0.3f" % entry.error_percentage}%)" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| require 'rbs' | ||
| require "optparse" | ||
|
|
||
| wait = false | ||
| duration = 3 | ||
|
|
||
| args = ARGV.dup | ||
|
|
||
| OptionParser.new do |opts| | ||
| opts.banner = "Usage: profile-parse.rb [options] FILE" | ||
|
|
||
| opts.on("--wait", "Wait for enter before starting") do | ||
| wait = true | ||
| end | ||
| opts.on("--duration=NUMBER", "Repeat parsing for <NUMBER> seconds") do |number| | ||
| duration = number.to_i | ||
| end | ||
| end.parse!(args) | ||
|
|
||
| if wait | ||
| puts "⏯️ Waiting for enter to continue at #{Process.pid}..." | ||
| STDIN.gets | ||
| end | ||
|
|
||
| file = args.shift or raise "No file path is given" | ||
| sig = File.read(file) | ||
|
|
||
| puts "Parsing #{file} -- #{sig.bytesize} bytes" | ||
|
|
||
| started_at = Time.now | ||
| count = 0 | ||
|
|
||
| loop do | ||
| count += 1 | ||
| RBS::Parser.parse_signature(sig) | ||
| break if (Time.now - started_at) > duration | ||
| end | ||
|
|
||
| puts "✅ Done #{count} loop(s)" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,6 @@ | ||
| #include "rbs/defines.h" | ||
| #include "rbs/lexer.h" | ||
| #include "rbs/util/rbs_assert.h" | ||
|
|
||
| static const char *RBS_TOKENTYPE_NAMES[] = { | ||
| "NullType", | ||
|
|
@@ -112,17 +114,60 @@ int rbs_token_bytes(rbs_token_t tok) { | |
| } | ||
|
|
||
| unsigned int rbs_peek(rbs_lexer_t *lexer) { | ||
| if (lexer->current.char_pos == lexer->end_pos) { | ||
| lexer->last_char = '\0'; | ||
| return 0; | ||
| return lexer->current_code_point; | ||
| } | ||
|
|
||
| bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This function assigns the next code point in the buffer and its byte length to the output parameters, and returns false at the end of the input. |
||
| if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) { | ||
| return false; | ||
| } | ||
|
|
||
| const char *start = lexer->string.start + lexer->current.byte_pos; | ||
|
|
||
| // Fast path for ASCII (single-byte) characters | ||
| if ((unsigned int) *start < 128) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We assume the character encoding of RBS files is ASCII-compatible, like that of Ruby source files. |
||
| *codepoint = (unsigned int) *start; | ||
| *byte_len = 1; | ||
| return true; | ||
| } | ||
|
|
||
| *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); | ||
|
|
||
| if (*byte_len == 1) { | ||
| *codepoint = (unsigned int) *start; | ||
| } else { | ||
| rbs_string_t str = rbs_string_new( | ||
| lexer->string.start + lexer->current.byte_pos, | ||
| lexer->string.end | ||
| ); | ||
| unsigned int c = rbs_utf8_string_to_codepoint(str); | ||
| lexer->last_char = c; | ||
| return c; | ||
| *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another hack to support encodings other than UTF-8. We may want to return an uppercase character to support multi-byte class/constant names. |
||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| void rbs_skip(rbs_lexer_t *lexer) { | ||
| rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0"); | ||
|
|
||
| if (RBS_UNLIKELY(lexer->current_code_point == '\0')) { | ||
| return; | ||
| } | ||
|
|
||
| unsigned int codepoint; | ||
| size_t byte_len; | ||
|
|
||
| lexer->current.byte_pos += lexer->current_character_bytes; | ||
| lexer->current.char_pos += 1; | ||
| if (lexer->current_code_point == '\n') { | ||
| lexer->current.line += 1; | ||
| lexer->current.column = 0; | ||
| lexer->first_token_of_line = true; | ||
| } else { | ||
| lexer->current.column += 1; | ||
| } | ||
|
|
||
| if (rbs_next_char(lexer, &codepoint, &byte_len)) { | ||
| lexer->current_code_point = codepoint; | ||
| lexer->current_character_bytes = byte_len; | ||
| } else { | ||
| lexer->current_character_bytes = 1; | ||
| lexer->current_code_point = '\0'; | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -156,35 +201,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) { | |
| } | ||
| } | ||
|
|
||
| void rbs_skip(rbs_lexer_t *lexer) { | ||
| if (!lexer->last_char) { | ||
| rbs_peek(lexer); | ||
| } | ||
|
|
||
| size_t byte_len; | ||
|
|
||
| if (lexer->last_char == '\0') { | ||
| byte_len = 1; | ||
| } else { | ||
| const char *start = lexer->string.start + lexer->current.byte_pos; | ||
| byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); | ||
| } | ||
|
|
||
| lexer->current.char_pos += 1; | ||
| lexer->current.byte_pos += byte_len; | ||
|
|
||
| if (lexer->last_char == '\n') { | ||
| lexer->current.line += 1; | ||
| lexer->current.column = 0; | ||
| lexer->first_token_of_line = true; | ||
| } else { | ||
| lexer->current.column += 1; | ||
| } | ||
| } | ||
|
|
||
| void rbs_skipn(rbs_lexer_t *lexer, size_t size) { | ||
| for (size_t i = 0; i < size; i++) { | ||
| rbs_peek(lexer); | ||
| rbs_skip(lexer); | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The lexer data structure now stores the code point of the next character, so peeking at the next character is much faster than reading it from the buffer.