Skip to content

Commit a10410e

Browse files
committed
Merge pull request #2665 from ruby/fix-lexer
Faster lexical analyzer
1 parent 9743a4c commit a10410e

File tree

10 files changed

+212
-56
lines changed

10 files changed

+212
-56
lines changed

Rakefile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,3 +522,17 @@ task :compile_c99 do
522522
ensure
523523
ENV.delete("TEST_NO_C23")
524524
end
525+
526+
# Rebuilds the extension from scratch with DEBUG unset, so extconf.rb does not
# append its DEBUG-only compiler flags to the benchmark build.
task :prepare_bench do
  ENV.delete("DEBUG")

  %i[clobber templates compile].each do |step|
    Rake::Task[step].invoke
  end
end
532+
533+
# Rebuilds the extension from scratch with DEBUG=1, which makes extconf.rb
# append its DEBUG-only compiler flags for a profiling build.
task :prepare_profiling do
  ENV["DEBUG"] = "1"

  %i[clobber templates compile].each do |step|
    Rake::Task[step].invoke
  end
end

bin/benchmark-parse.rb

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
require "rbs"
require "benchmark/ips"
require "csv"
require "pathname"

# Read every file named on the command line into an RBS::Buffer up front, so
# the timed block below only measures parsing and not file I/O.
buffers = ARGV.to_h do |path|
  [path, RBS::Buffer.new(content: File.read(path), name: Pathname(path))]
end

puts "Benchmarking parsing #{buffers.size} files..."

# One iteration of the "parsing" report parses every loaded buffer once.
result = Benchmark.ips do |bench|
  bench.report("parsing") do
    buffers.each_value { |buffer| RBS::Parser.parse_signature(buffer) }
  end

  bench.quiet = true
end

entry = result.entries[0]
puts "✅ #{"%0.3f" % entry.ips} i/s (±#{"%0.3f" % entry.error_percentage}%)"

bin/profile-parse.rb

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
require 'rbs'
require "optparse"

# Parses a single RBS file over and over for a fixed number of seconds.
# With --wait the script prints its PID and pauses until enter is pressed,
# which leaves time to attach an external tool to the process first.

wait = false
duration = 3

args = ARGV.dup

OptionParser.new do |opts|
  opts.banner = "Usage: profile-parse.rb [options] FILE"

  opts.on("--wait", "Wait for enter before starting") do
    wait = true
  end
  # Integer coercion makes OptionParser reject a non-numeric value instead of
  # String#to_i silently turning it into 0 seconds.
  opts.on("--duration=NUMBER", Integer, "Repeat parsing for <NUMBER> seconds") do |number|
    duration = number
  end
end.parse!(args)

if wait
  puts "⏯️ Waiting for enter to continue at #{Process.pid}..."
  STDIN.gets
end

file = args.shift or raise "No file path is given"
sig = File.read(file)

puts "Parsing #{file} -- #{sig.bytesize} bytes"

# Elapsed time is measured on the monotonic clock: unlike Time.now it cannot
# jump when the system clock is adjusted while the loop is running.
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
count = 0

loop do
  count += 1
  RBS::Parser.parse_signature(sig)
  break if (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) > duration
end

puts "✅ Done #{count} loop(s)"

ext/rbs_extension/extconf.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
'-Wc++-compat',
1919
]
2020

21-
append_cflags ['-O0', '-g'] if ENV['DEBUG']
21+
append_cflags ['-O0', '-pg'] if ENV['DEBUG']
2222
if ENV["TEST_NO_C23"]
2323
puts "Adding -Wc2x-extensions to CFLAGS"
2424
$CFLAGS << " -Werror -Wc2x-extensions"

include/rbs/defines.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,24 @@
3232
#define RBS_ATTRIBUTE_FORMAT(string_index, argument_index)
3333
#endif
3434

35+
/**
 * Support RBS_LIKELY and RBS_UNLIKELY to help the compiler optimize its
 * branch prediction.
 *
 * Both macros evaluate `x` exactly once and yield its truth value (0 or 1)
 * on every compiler, so they are meant to wrap boolean conditions only.
 */
#if defined(__GNUC__) || defined(__clang__)
/** The compiler should predict that this branch will be taken. */
#define RBS_LIKELY(x) __builtin_expect(!!(x), 1)

/** The compiler should predict that this branch will not be taken. */
#define RBS_UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
/** No hint is emitted because this compiler does not support branch prediction hints. */
#define RBS_LIKELY(x) (!!(x))

/** No hint is emitted because this compiler does not support branch prediction hints. */
#define RBS_UNLIKELY(x) (!!(x))
#endif
52+
3553
/**
3654
* We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch.
3755
* Use RBS_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional.

include/rbs/lexer.h

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -119,20 +119,26 @@ typedef struct {
119119
* The lexer state is the current token.
120120
*
121121
* ```
122-
* ... "a string token"
123-
* ^ start position
124-
* ^ current position
125-
* ~~~~~~ Token => "a str
122+
#. 0.1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6
123+
* ... " a s t r i n g t o k e n "
124+
* ^ start position (0)
125+
* ^ current position (6)
126+
* ^ current character ('i', bytes = 1)
127+
* ~~~~~~~~~~~ Token => "a str
126128
* ```
127129
* */
128130
typedef struct {
129131
rbs_string_t string;
130-
int start_pos; /* The character position that defines the start of the input */
131-
int end_pos; /* The character position that defines the end of the input */
132-
rbs_position_t current; /* The current position */
133-
rbs_position_t start; /* The start position of the current token */
132+
int start_pos; /* The character position that defines the start of the input */
133+
int end_pos; /* The character position that defines the end of the input */
134+
rbs_position_t current; /* The current position: just before the current_character */
135+
rbs_position_t start; /* The start position of the current token */
136+
137+
unsigned int current_code_point; /* Current character code point */
138+
size_t current_character_bytes; /* Current character byte length (0 or 1~4) */
139+
134140
bool first_token_of_line; /* This flag is used for tLINECOMMENT */
135-
unsigned int last_char; /* Last peeked character */
141+
136142
const rbs_encoding_t *encoding;
137143
} rbs_lexer_t;
138144

@@ -152,15 +158,23 @@ int rbs_token_bytes(rbs_token_t tok);
152158
const char *rbs_token_type_str(enum RBSTokenType type);
153159

154160
/**
155-
* Read next character.
161+
* Returns the next character.
156162
* */
157163
unsigned int rbs_peek(rbs_lexer_t *lexer);
158164

159165
/**
160-
* Skip one character.
166+
* Advances the current position by one character.
161167
* */
162168
void rbs_skip(rbs_lexer_t *lexer);
163169

170+
/**
171+
* Read next character and store the codepoint and byte length to the given pointers.
172+
*
173+
* This doesn't update the lexer state.
174+
* Returns `true` on success, or `false` if the end of the input has been reached.
175+
* */
176+
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *bytes);
177+
164178
/**
165179
* Skip n characters.
166180
* */
@@ -180,4 +194,6 @@ rbs_token_t rbs_lexer_next_token(rbs_lexer_t *lexer);
180194

181195
void rbs_print_token(rbs_token_t tok);
182196

197+
void rbs_print_lexer(rbs_lexer_t *lexer);
198+
183199
#endif

src/lexstate.c

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#include "rbs/defines.h"
12
#include "rbs/lexer.h"
3+
#include "rbs/util/rbs_assert.h"
24

35
static const char *RBS_TOKENTYPE_NAMES[] = {
46
"NullType",
@@ -105,17 +107,60 @@ int rbs_token_bytes(rbs_token_t tok) {
105107
}
106108

107109
/**
 * Returns the code point of the current character without consuming it.
 *
 * The value is read from the lexer state; this function does no decoding.
 */
unsigned int rbs_peek(rbs_lexer_t *lexer) {
    return lexer->current_code_point;
}

/**
 * Decodes the character at the lexer's current position without modifying the lexer.
 *
 * @param lexer The lexer whose current position is inspected.
 * @param codepoint Receives the byte value for single-byte characters, or a fixed
 *   placeholder code point for every multi-byte character.
 * @param byte_len Receives the byte length of the character.
 * @return `false` if the current position is at the end of the input (the out
 *   parameters are left untouched), `true` otherwise.
 */
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) {
    if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) {
        return false;
    }

    const char *start = lexer->string.start + lexer->current.byte_pos;

    // Read the byte as unsigned. Plain `char` may be signed, in which case a byte >= 0x80
    // would be sign-extended to 0xFFFFFFxx and the reported code point would differ by platform.
    const unsigned int first_byte = (unsigned char) *start;

    // Fast path for ASCII (single-byte) characters
    if (first_byte < 128) {
        *codepoint = first_byte;
        *byte_len = 1;
        return true;
    }

    // NOTE(review): if `char_width` can return 0 for an invalid byte sequence, `*byte_len`
    // becomes 0 here, which `rbs_skip` asserts against -- confirm against the encoding tables.
    *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));

    if (*byte_len == 1) {
        *codepoint = first_byte;
    } else {
        *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode
    }

    return true;
}
137+
138+
/**
 * Consumes the current character and loads the one that follows it.
 *
 * Does nothing when the current code point is '\0'. Otherwise the byte and
 * character positions move past the current character, line/column tracking is
 * updated (a newline starts a new line and sets `first_token_of_line`), and the
 * next character is decoded into the lexer state. When there is no next
 * character the state becomes code point '\0' with a byte length of 1.
 */
void rbs_skip(rbs_lexer_t *lexer) {
    rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0");

    if (RBS_UNLIKELY(lexer->current_code_point == '\0')) {
        return;
    }

    // The position must be advanced first: rbs_next_char decodes from lexer->current.
    lexer->current.byte_pos += lexer->current_character_bytes;
    lexer->current.char_pos += 1;

    if (lexer->current_code_point != '\n') {
        lexer->current.column += 1;
    } else {
        lexer->current.line += 1;
        lexer->current.column = 0;
        lexer->first_token_of_line = true;
    }

    unsigned int next_code_point;
    size_t next_bytes;

    if (!rbs_next_char(lexer, &next_code_point, &next_bytes)) {
        // End of input: fall back to the '\0' marker with a byte length of 1.
        next_code_point = '\0';
        next_bytes = 1;
    }

    lexer->current_code_point = next_code_point;
    lexer->current_character_bytes = next_bytes;
}
121166

@@ -149,35 +194,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) {
149194
}
150195
}
151196

152-
void rbs_skip(rbs_lexer_t *lexer) {
153-
if (!lexer->last_char) {
154-
rbs_peek(lexer);
155-
}
156-
157-
size_t byte_len;
158-
159-
if (lexer->last_char == '\0') {
160-
byte_len = 1;
161-
} else {
162-
const char *start = lexer->string.start + lexer->current.byte_pos;
163-
byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
164-
}
165-
166-
lexer->current.char_pos += 1;
167-
lexer->current.byte_pos += byte_len;
168-
169-
if (lexer->last_char == '\n') {
170-
lexer->current.line += 1;
171-
lexer->current.column = 0;
172-
lexer->first_token_of_line = true;
173-
} else {
174-
lexer->current.column += 1;
175-
}
176-
}
177-
178197
void rbs_skipn(rbs_lexer_t *lexer, size_t size) {
179198
for (size_t i = 0; i < size; i++) {
180-
rbs_peek(lexer);
181199
rbs_skip(lexer);
182200
}
183201
}

src/parser.c

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <string.h>
88

99
#include "rbs/defines.h"
10+
#include "rbs/lexer.h"
1011
#include "rbs/string.h"
1112
#include "rbs/util/rbs_unescape.h"
1213
#include "rbs/util/rbs_buffer.h"
@@ -3327,6 +3328,14 @@ void rbs_print_token(rbs_token_t tok) {
33273328
);
33283329
}
33293330

3331+
/**
 * Prints the lexer state to stdout for debugging: the input range and encoding
 * name, the start and current positions, the current character, and the
 * `first_token_of_line` flag.
 */
void rbs_print_lexer(rbs_lexer_t *lexer) {
    const unsigned int code_point = lexer->current_code_point;

    // Echo the character only when it is printable ASCII. Control characters (including the
    // '\0' end-of-input marker) and non-ASCII values would garble the output, so show '?'.
    // `%c` takes an int, hence the explicit conversion.
    const int display_char = (code_point >= 0x20 && code_point < 0x7F) ? (int) code_point : '?';

    printf("Lexer: (range = %d...%d, encoding = %s)\n", lexer->start_pos, lexer->end_pos, lexer->encoding->name);
    printf(" start = { char_pos = %d, byte_pos = %d }\n", lexer->start.char_pos, lexer->start.byte_pos);
    printf(" current = { char_pos = %d, byte_pos = %d }\n", lexer->current.char_pos, lexer->current.byte_pos);
    // `current_code_point` is unsigned, so it is printed with %u rather than %d.
    printf(" character = { code_point = %u (%c), bytes = %zu }\n", code_point, display_char, lexer->current_character_bytes);
    printf(" first_token_of_line = %s\n", lexer->first_token_of_line ? "true" : "false");
}
3338+
33303339
rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line) {
33313340
int comment_line = subject_line - 1;
33323341

@@ -3355,14 +3364,28 @@ rbs_lexer_t *rbs_lexer_new(rbs_allocator_t *allocator, rbs_string_t string, cons
33553364
.end_pos = end_pos,
33563365
.current = start_position,
33573366
.start = { 0 },
3358-
.first_token_of_line = false,
3359-
.last_char = 0,
3367+
.first_token_of_line = true,
3368+
.current_character_bytes = 0,
3369+
.current_code_point = '\0',
33603370
.encoding = encoding,
33613371
};
33623372

3363-
rbs_skipn(lexer, start_pos);
3373+
unsigned int codepoint;
3374+
size_t bytes;
3375+
3376+
if (rbs_next_char(lexer, &codepoint, &bytes)) {
3377+
lexer->current_code_point = codepoint;
3378+
lexer->current_character_bytes = bytes;
3379+
} else {
3380+
lexer->current_code_point = '\0';
3381+
lexer->current_character_bytes = 1;
3382+
}
3383+
3384+
if (start_pos > 0) {
3385+
rbs_skipn(lexer, start_pos);
3386+
}
3387+
33643388
lexer->start = lexer->current;
3365-
lexer->first_token_of_line = lexer->current.column == 0;
33663389

33673390
return lexer;
33683391
}

src/string.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "rbs/string.h"
2+
#include "rbs/defines.h"
23

34
#include <stdlib.h>
45
#include <string.h>
@@ -14,7 +15,7 @@ unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
1415

1516
if (s >= end) return 0; // End of string
1617

17-
if ((*s & 0x80) == 0) {
18+
if (RBS_LIKELY((*s & 0x80) == 0)) {
1819
// Single byte character (0xxxxxxx)
1920
return *s;
2021
} else if ((*s & 0xE0) == 0xC0) {

0 commit comments

Comments
 (0)