diff --git a/Rakefile b/Rakefile
index 2a9541de8..6400f5379 100644
--- a/Rakefile
+++ b/Rakefile
@@ -537,3 +537,19 @@ task :compile_c99 do
 ensure
   ENV.delete("TEST_NO_C23")
 end
+
+desc "Rebuild the extension from scratch with DEBUG unset, for benchmarking"
+task :prepare_bench do
+  ENV.delete("DEBUG")
+  Rake::Task[:clobber].invoke
+  Rake::Task[:templates].invoke
+  Rake::Task[:compile].invoke
+end
+
+desc "Rebuild the extension from scratch with DEBUG=1, for profiling"
+task :prepare_profiling do
+  ENV["DEBUG"] = "1"
+  Rake::Task[:clobber].invoke
+  Rake::Task[:templates].invoke
+  Rake::Task[:compile].invoke
+end
diff --git a/bin/benchmark-parse.rb b/bin/benchmark-parse.rb
new file mode 100644
index 000000000..98e4ce16d
--- /dev/null
+++ b/bin/benchmark-parse.rb
@@ -0,0 +1,29 @@
+# Reports how many times per second the given RBS files can all be parsed.
+#
+# Usage: benchmark-parse.rb FILE...
+
+require "rbs"
+require "benchmark/ips"
+require "pathname"
+
+# Read every file up front so that the measured block only does the parsing.
+files = {}
+ARGV.each do |file|
+  content = File.read(file)
+  files[file] = RBS::Buffer.new(content: content, name: Pathname(file))
+end
+
+puts "Benchmarking parsing #{files.size} files..."
+
+result = Benchmark.ips do |x|
+  x.report("parsing") do
+    files.each_value do |buffer|
+      RBS::Parser.parse_signature(buffer)
+    end
+  end
+
+  x.quiet = true
+end
+
+entry = result.entries[0]
+puts "✅ #{"%0.3f" % entry.ips} i/s (±#{"%0.3f" % entry.error_percentage}%)"
diff --git a/bin/profile-parse.rb b/bin/profile-parse.rb
new file mode 100644
index 000000000..3f9f75dbb
--- /dev/null
+++ b/bin/profile-parse.rb
@@ -0,0 +1,47 @@
+# Parses one RBS file over and over for a fixed number of seconds.
+# With --wait it prints its PID and pauses first (e.g. to attach a profiler).
+#
+# Usage: profile-parse.rb [options] FILE
+
+require "rbs"
+require "optparse"
+
+wait = false
+duration = 3
+
+args = ARGV.dup
+
+OptionParser.new do |opts|
+  opts.banner = "Usage: profile-parse.rb [options] FILE"
+
+  opts.on("--wait", "Wait for enter before starting") do
+    wait = true
+  end
+  # Integer coercion makes OptionParser reject non-numeric values instead of
+  # silently turning them into 0 seconds.
+  opts.on("--duration=NUMBER", Integer, "Repeat parsing for seconds") do |number|
+    duration = number
+  end
+end.parse!(args)
+
+if wait
+  puts "⏯️ Waiting for enter to continue at #{Process.pid}..."
+  STDIN.gets
+end
+
+file = args.shift or raise "No file path is given"
+sig = File.read(file)
+
+puts "Parsing #{file} -- #{sig.bytesize} bytes"
+
+# Measure with the monotonic clock: wall-clock time may jump while we loop.
+started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+count = 0
+
+loop do
+  count += 1
+  RBS::Parser.parse_signature(sig)
+  break if (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) > duration
+end
+
+puts "✅ Done #{count} loop(s)"
diff --git a/ext/rbs_extension/extconf.rb b/ext/rbs_extension/extconf.rb
index 88e971c3e..eb25ec562 100644
--- a/ext/rbs_extension/extconf.rb
+++ b/ext/rbs_extension/extconf.rb
@@ -18,7 +18,7 @@
   '-Wc++-compat',
 ]
 
-append_cflags ['-O0', '-g'] if ENV['DEBUG']
+append_cflags ['-O0', '-g', '-pg'] if ENV['DEBUG']
 if ENV["TEST_NO_C23"]
   puts "Adding -Wc2x-extensions to CFLAGS"
   $CFLAGS << " -Werror -Wc2x-extensions"
diff --git a/include/rbs/defines.h b/include/rbs/defines.h
index 6dc2091a6..f193ab5f4 100644
--- a/include/rbs/defines.h
+++ b/include/rbs/defines.h
@@ -32,6 +32,24 @@
 #define RBS_ATTRIBUTE_FORMAT(string_index, argument_index)
 #endif
 
+/**
+ * Support RBS_LIKELY and RBS_UNLIKELY to help the compiler optimize its
+ * branch prediction. Both evaluate to 0 or 1 on every compiler.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+/** The compiler should predict that this branch will be taken. */
+#define RBS_LIKELY(x) __builtin_expect(!!(x), 1)
+
+/** The compiler should predict that this branch will not be taken. */
+#define RBS_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+/** No-op because this platform does not support branch prediction hints. */
+#define RBS_LIKELY(x) (!!(x))
+
+/** No-op because this platform does not support branch prediction hints. */
+#define RBS_UNLIKELY(x) (!!(x))
+#endif
+
 /**
  * We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch.
  * Use RBS_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional.
diff --git a/include/rbs/lexer.h b/include/rbs/lexer.h
index ec3bad468..6fd527959 100644
--- a/include/rbs/lexer.h
+++ b/include/rbs/lexer.h
@@ -126,20 +126,26 @@ typedef struct {
  * The lexer state is the curren token.
  *
  * ```
- * ... "a string token"
- *      ^ start position
- *         ^ current position
- *      ~~~~~~ Token => "a str
+ *      0.1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6
+ *  ... " a   s t r i n g   t o k e n "
+ *      ^                                  start position (0)
+ *                 ^                       current position (6)
+ *                  ^                      current character ('i', bytes = 1)
+ *      ~~~~~~~~~~~                        Token => "a str
  * ```
  *
  */
 typedef struct {
     rbs_string_t string;
-    int start_pos;          /* The character position that defines the start of the input */
-    int end_pos;            /* The character position that defines the end of the input */
-    rbs_position_t current; /* The current position */
-    rbs_position_t start;   /* The start position of the current token */
+    int start_pos;          /* The character position that defines the start of the input */
+    int end_pos;            /* The character position that defines the end of the input */
+    rbs_position_t current; /* The current position: just before the current character */
+    rbs_position_t start;   /* The start position of the current token */
+
+    unsigned int current_code_point; /* Code point of the current character ('\0' once the end of the input is reached) */
+    size_t current_character_bytes;  /* Current character byte length (0 or 1~4) */
+
     bool first_token_of_line; /* This flag is used for tLINECOMMENT */
-    unsigned int last_char;   /* Last peeked character */
+
     const rbs_encoding_t *encoding;
 } rbs_lexer_t;
@@ -159,15 +165,23 @@ int rbs_token_bytes(rbs_token_t tok);
 const char *rbs_token_type_str(enum RBSTokenType type);
 
 /**
- * Read next character.
+ * Returns the code point of the current character without advancing the lexer.
  * */
 unsigned int rbs_peek(rbs_lexer_t *lexer);
 
 /**
- * Skip one character.
+ * Advances the current position by one character (does nothing when the current code point is '\0').
  * */
 void rbs_skip(rbs_lexer_t *lexer);
 
+/**
+ * Reads the character at the current position and stores its code point and byte length to the given pointers.
+ * For characters wider than one byte the stored code point is a fixed placeholder, not the decoded value.
+ * This doesn't update the lexer state.
+ * Returns `true` on success, or `false` when the end of the input has been reached.
+ * */
+bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len);
+
 /**
  * Skip n characters.
  * */
@@ -187,4 +201,7 @@ rbs_token_t rbs_lexer_next_token(rbs_lexer_t *lexer);
 
 void rbs_print_token(rbs_token_t tok);
 
+/** Prints the lexer's positions and current character to stdout. */
+void rbs_print_lexer(rbs_lexer_t *lexer);
+
 #endif
diff --git a/src/lexstate.c b/src/lexstate.c
index 8e9e3e03a..d2241eb42 100644
--- a/src/lexstate.c
+++ b/src/lexstate.c
@@ -1,4 +1,6 @@
+#include "rbs/defines.h"
 #include "rbs/lexer.h"
+#include "rbs/util/rbs_assert.h"
 
 static const char *RBS_TOKENTYPE_NAMES[] = {
     "NullType",
@@ -112,17 +114,60 @@ int rbs_token_bytes(rbs_token_t tok) {
 }
 
 unsigned int rbs_peek(rbs_lexer_t *lexer) {
-    if (lexer->current.char_pos == lexer->end_pos) {
-        lexer->last_char = '\0';
-        return 0;
+    return lexer->current_code_point;
+}
+
+bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) {
+    if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) {
+        return false;
+    }
+
+    const char *start = lexer->string.start + lexer->current.byte_pos;
+
+    // Fast path for ASCII (single-byte) characters
+    if ((unsigned char) *start < 128) {
+        *codepoint = (unsigned char) *start;
+        *byte_len = 1;
+        return true;
+    }
+
+    *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
+
+    if (*byte_len == 1) {
+        *codepoint = (unsigned char) *start; // Via unsigned char: a signed char >= 0x80 must not be sign-extended
     } else {
-        rbs_string_t str = rbs_string_new(
-            lexer->string.start + lexer->current.byte_pos,
-            lexer->string.end
-        );
-        unsigned int c = rbs_utf8_string_to_codepoint(str);
-        lexer->last_char = c;
-        return c;
+        *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode
+    }
+
+    return true;
+}
+
+void rbs_skip(rbs_lexer_t *lexer) {
+    rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0");
+
+    if (RBS_UNLIKELY(lexer->current_code_point == '\0')) {
+        return; // '\0' is what gets stored once rbs_next_char reports the end of the input
+    }
+
+    unsigned int codepoint;
+    size_t byte_len;
+
+    lexer->current.byte_pos += lexer->current_character_bytes;
+    lexer->current.char_pos += 1;
+    if (lexer->current_code_point == '\n') {
+        lexer->current.line += 1;
+        lexer->current.column = 0;
+        lexer->first_token_of_line = true;
+    } else {
+        lexer->current.column += 1;
+    }
+
+    if (rbs_next_char(lexer, &codepoint, &byte_len)) {
+        lexer->current_code_point = codepoint;
+        lexer->current_character_bytes = byte_len;
+    } else {
+        lexer->current_character_bytes = 1;
+        lexer->current_code_point = '\0';
     }
 }
 
@@ -156,35 +201,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) {
     }
 }
 
-void rbs_skip(rbs_lexer_t *lexer) {
-    if (!lexer->last_char) {
-        rbs_peek(lexer);
-    }
-
-    size_t byte_len;
-
-    if (lexer->last_char == '\0') {
-        byte_len = 1;
-    } else {
-        const char *start = lexer->string.start + lexer->current.byte_pos;
-        byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
-    }
-
-    lexer->current.char_pos += 1;
-    lexer->current.byte_pos += byte_len;
-
-    if (lexer->last_char == '\n') {
-        lexer->current.line += 1;
-        lexer->current.column = 0;
-        lexer->first_token_of_line = true;
-    } else {
-        lexer->current.column += 1;
-    }
-}
-
 void rbs_skipn(rbs_lexer_t *lexer, size_t size) {
     for (size_t i = 0; i < size; i++) {
-        rbs_peek(lexer);
         rbs_skip(lexer);
     }
 }
diff --git a/src/parser.c b/src/parser.c
index 0cbb4bf12..93c7e1b08 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -7,6 +7,7 @@
 #include 
 
 #include "rbs/defines.h"
+#include "rbs/lexer.h"
 #include "rbs/string.h"
 #include "rbs/util/rbs_unescape.h"
 #include "rbs/util/rbs_buffer.h"
@@ -3456,6 +3457,14 @@ void rbs_print_token(rbs_token_t tok) {
     );
 }
 
+void rbs_print_lexer(rbs_lexer_t *lexer) {
+    printf("Lexer: (range = %d...%d, encoding = %s)\n", lexer->start_pos, lexer->end_pos, lexer->encoding->name);
+    printf("  start = { char_pos = %d, byte_pos = %d }\n", lexer->start.char_pos, lexer->start.byte_pos);
+    printf("  current = { char_pos = %d, byte_pos = %d }\n", lexer->current.char_pos, lexer->current.byte_pos);
+    printf("  character = { code_point = %u (%c), bytes = %zu }\n", lexer->current_code_point, lexer->current_code_point < 256 ? lexer->current_code_point : '?', lexer->current_character_bytes);
+    printf("  first_token_of_line = %s\n", lexer->first_token_of_line ? "true" : "false");
+}
+
 rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line) {
     int comment_line = subject_line - 1;
 
@@ -3484,14 +3493,28 @@ rbs_lexer_t *rbs_lexer_new(rbs_allocator_t *allocator, rbs_string_t string, cons
         .end_pos = end_pos,
         .current = start_position,
         .start = { 0 },
-        .first_token_of_line = false,
-        .last_char = 0,
+        .first_token_of_line = true,
+        .current_character_bytes = 0,
+        .current_code_point = '\0',
         .encoding = encoding,
     };
 
-    rbs_skipn(lexer, start_pos);
+    unsigned int codepoint;
+    size_t bytes;
+
+    if (rbs_next_char(lexer, &codepoint, &bytes)) {
+        lexer->current_code_point = codepoint;
+        lexer->current_character_bytes = bytes;
+    } else {
+        lexer->current_code_point = '\0';
+        lexer->current_character_bytes = 1;
+    }
+
+    if (start_pos > 0) {
+        rbs_skipn(lexer, start_pos);
+    }
+
     lexer->start = lexer->current;
-    lexer->first_token_of_line = lexer->current.column == 0;
 
     return lexer;
 }
diff --git a/src/string.c b/src/string.c
index dc4e87f7b..cc7de5e98 100644
--- a/src/string.c
+++ b/src/string.c
@@ -1,4 +1,5 @@
 #include "rbs/string.h"
+#include "rbs/defines.h"
 
 #include 
 #include 
@@ -14,7 +15,7 @@ unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
 
     if (s >= end) return 0; // End of string
 
-    if ((*s & 0x80) == 0) {
+    if (RBS_LIKELY((*s & 0x80) == 0)) {
         // Single byte character (0xxxxxxx)
         return *s;
     } else if ((*s & 0xE0) == 0xC0) {
diff --git a/src/util/rbs_encoding.c b/src/util/rbs_encoding.c
index 9516e2a4e..b8c5e58a5 100644
--- a/src/util/rbs_encoding.c
+++ b/src/util/rbs_encoding.c
@@ -4620,6 +4620,7 @@ rbs_unicode_codepoint_match(rbs_unicode_codepoint_t codepoint, const rbs_unicode
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+// clang-format off
 static const uint8_t rbs_utf_8_dfa[] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
     0,
@@ -4991,6 +4992,7 @@ static const uint8_t rbs_utf_8_dfa[] = {
     1,
     1, // s7..s8
 };
+// clang-format on
 
 /**
  * Given a pointer to a string and the number of bytes remaining in the string,
@@ -4999,7 +5001,7 @@ static const uint8_t rbs_utf_8_dfa[] = {
  */
 static rbs_unicode_codepoint_t
 rbs_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
-    rbs_assert(n >= 0, "n must be greater than or equal to 0. Got %ti", n);
+    rbs_assert(n >= 0, "[rbs_utf_8_codepoint] n must be greater than or equal to 0. Got %ti", n);
 
     size_t maximum = (n > 4) ? 4 : ((size_t) n);
     uint32_t codepoint;
@@ -5029,7 +5031,7 @@ rbs_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
  */
 size_t
 rbs_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
-    rbs_assert(n >= 0, "n must be greater than or equal to 0. Got %ti", n);
+    rbs_assert(n >= 0, "[rbs_encoding_utf_8_char_width] n must be greater than or equal to 0. Got %ti", n);
 
     size_t maximum = (n > 4) ? 4 : ((size_t) n);
     uint32_t state = 0;