Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -537,3 +537,17 @@ task :compile_c99 do
ensure
ENV.delete("TEST_NO_C23")
end

# Clears the DEBUG environment variable, then invokes the clobber, templates
# and compile tasks in that order.
task :prepare_bench do
  ENV.delete("DEBUG")

  %i[clobber templates compile].each do |task_name|
    Rake::Task[task_name].invoke
  end
end

# Sets DEBUG=1 in the environment, then invokes the clobber, templates and
# compile tasks in that order.
task :prepare_profiling do
  ENV["DEBUG"] = "1"

  %i[clobber templates compile].each do |task_name|
    Rake::Task[task_name].invoke
  end
end
25 changes: 25 additions & 0 deletions bin/benchmark-parse.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "rbs"
require "benchmark/ips"
require "csv"
require "pathname"

# Read every file given on the command line into an RBS::Buffer up front, keyed
# by its path, so the measured block below only contains parsing.
buffers = ARGV.each_with_object({}) do |path, table|
  source = File.read(path)
  table[path] = RBS::Buffer.new(content: source, name: Pathname(path))
end

puts "Benchmarking parsing #{buffers.size} files..."

# One benchmark iteration parses all of the loaded buffers once.
report = Benchmark.ips do |bench|
  bench.report("parsing") do
    buffers.each_value do |buffer|
      RBS::Parser.parse_signature(buffer)
    end
  end

  bench.quiet = true
end

# Only one report is registered, so the first entry holds the result.
entry = report.entries.first
ips = "%0.3f" % entry.ips
error = "%0.3f" % entry.error_percentage
puts "✅ #{ips} i/s (±#{error}%)"
39 changes: 39 additions & 0 deletions bin/profile-parse.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
require 'rbs'
require "optparse"

# Repeatedly parses one RBS file for a fixed number of seconds so that an
# external profiler can be attached to this process.
#
# Options:
#   --wait             Block on STDIN before parsing starts (the PID is printed).
#   --duration=NUMBER  Keep parsing for NUMBER seconds (default: 3).

wait = false
duration = 3

args = ARGV.dup

OptionParser.new do |opts|
  opts.banner = "Usage: profile-parse.rb [options] FILE"

  opts.on("--wait", "Wait for enter before starting") do
    wait = true
  end
  # Integer coercion makes OptionParser reject a non-numeric value instead of
  # silently turning it into 0 the way String#to_i does.
  opts.on("--duration=NUMBER", Integer, "Repeat parsing for <NUMBER> seconds") do |number|
    duration = number
  end
end.parse!(args)

# Validate the arguments and load the input before the interactive wait, so a
# missing or unreadable file fails immediately.
file = args.shift or raise "No file path is given"
sig = File.read(file)

if wait
  puts "⏯️ Waiting for enter to continue at #{Process.pid}..."
  STDIN.gets
end

puts "Parsing #{file} -- #{sig.bytesize} bytes"

# Use the monotonic clock for the elapsed-time check: unlike Time.now it is not
# affected by system clock adjustments.
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
count = 0

loop do
  count += 1
  RBS::Parser.parse_signature(sig)
  break if (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) > duration
end

puts "✅ Done #{count} loop(s)"
2 changes: 1 addition & 1 deletion ext/rbs_extension/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
'-Wc++-compat',
]

append_cflags ['-O0', '-g'] if ENV['DEBUG']
append_cflags ['-O0', '-pg'] if ENV['DEBUG']
if ENV["TEST_NO_C23"]
puts "Adding -Wc2x-extensions to CFLAGS"
$CFLAGS << " -Werror -Wc2x-extensions"
Expand Down
18 changes: 18 additions & 0 deletions include/rbs/defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,24 @@
#define RBS_ATTRIBUTE_FORMAT(string_index, argument_index)
#endif

/**
 * RBS_LIKELY and RBS_UNLIKELY annotate a condition with its expected outcome
 * so that the compiler can optimize its branch prediction.
 */
#if !defined(__GNUC__) && !defined(__clang__)
/** No-op because this platform does not support branch prediction hints. */
#define RBS_LIKELY(x) (x)

/** No-op because this platform does not support branch prediction hints. */
#define RBS_UNLIKELY(x) (x)
#else
/** The compiler should predict that this branch will be taken. */
#define RBS_LIKELY(x) __builtin_expect(!!(x), 1)

/** The compiler should predict that this branch will not be taken. */
#define RBS_UNLIKELY(x) __builtin_expect(!!(x), 0)
#endif

/**
* We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch.
* Use RBS_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional.
Expand Down
38 changes: 27 additions & 11 deletions include/rbs/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,20 +126,26 @@ typedef struct {
* The lexer state is the current token.
*
* ```
* ... "a string token"
* ^ start position
* ^ current position
* ~~~~~~ Token => "a str
#. 0.1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6
* ... " a s t r i n g t o k e n "
* ^ start position (0)
* ^ current position (6)
* ^ current character ('i', bytes = 1)
* ~~~~~~~~~~~ Token => "a str
* ```
* */
typedef struct {
rbs_string_t string;
int start_pos; /* The character position that defines the start of the input */
int end_pos; /* The character position that defines the end of the input */
rbs_position_t current; /* The current position */
rbs_position_t start; /* The start position of the current token */
int start_pos; /* The character position that defines the start of the input */
int end_pos; /* The character position that defines the end of the input */
rbs_position_t current; /* The current position: just before the current_character */
rbs_position_t start; /* The start position of the current token */

unsigned int current_code_point; /* Current character code point */
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lexer data structure now stores the code point of the next character, so that peeking at the next character is much faster than reading it from the buffer.

size_t current_character_bytes; /* Current character byte length (0 or 1~4) */

bool first_token_of_line; /* This flag is used for tLINECOMMENT */
unsigned int last_char; /* Last peeked character */

const rbs_encoding_t *encoding;
} rbs_lexer_t;

Expand All @@ -159,15 +165,23 @@ int rbs_token_bytes(rbs_token_t tok);
const char *rbs_token_type_str(enum RBSTokenType type);

/**
* Read next character.
* Returns the next character.
* */
unsigned int rbs_peek(rbs_lexer_t *lexer);

/**
* Skip one character.
* Advances the current position by one character.
* */
void rbs_skip(rbs_lexer_t *lexer);

/**
* Read next character and store the codepoint and byte length to the given pointers.
*
* This doesn't update the lexer state.
* Returns `true` on success, or `false` if EOF has been reached.
* */
bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *bytes);

/**
* Skip n characters.
* */
Expand All @@ -187,4 +201,6 @@ rbs_token_t rbs_lexer_next_token(rbs_lexer_t *lexer);

void rbs_print_token(rbs_token_t tok);

void rbs_print_lexer(rbs_lexer_t *lexer);

#endif
92 changes: 55 additions & 37 deletions src/lexstate.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "rbs/defines.h"
#include "rbs/lexer.h"
#include "rbs/util/rbs_assert.h"

static const char *RBS_TOKENTYPE_NAMES[] = {
"NullType",
Expand Down Expand Up @@ -112,17 +114,60 @@ int rbs_token_bytes(rbs_token_t tok) {
}

unsigned int rbs_peek(rbs_lexer_t *lexer) {
if (lexer->current.char_pos == lexer->end_pos) {
lexer->last_char = '\0';
return 0;
return lexer->current_code_point;
}

bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function assigns the next code point in the buffer and its byte length.

if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) {
return false;
}

const char *start = lexer->string.start + lexer->current.byte_pos;

// Fast path for ASCII (single-byte) characters
if ((unsigned int) *start < 128) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We assume the character encoding of RBS files is ASCII-compatible, like Ruby source files.
If the character is an ASCII character, it is a single-byte character.

*codepoint = (unsigned int) *start;
*byte_len = 1;
return true;
}

*byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));

if (*byte_len == 1) {
*codepoint = (unsigned int) *start;
} else {
rbs_string_t str = rbs_string_new(
lexer->string.start + lexer->current.byte_pos,
lexer->string.end
);
unsigned int c = rbs_utf8_string_to_codepoint(str);
lexer->last_char = c;
return c;
*codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another hack to support encodings other than UTF-8.
The lexer doesn't know the exact Unicode code point of the next character in other encodings, so it returns an arbitrary code point instead. The lexer still reads the character, and because the arbitrary code point has no special meaning to the lexer, this works fine.

We may want to return an uppercase character to support multi-byte class/constant names.

}

return true;
}

/**
 * Advances the lexer by one character and caches the character that follows.
 *
 * Does nothing when the current code point is '\0'. Otherwise the byte and
 * character positions move past the current character, the line/column
 * bookkeeping is updated, and the next character is loaded through
 * rbs_next_char into current_code_point / current_character_bytes.
 */
void rbs_skip(rbs_lexer_t *lexer) {
    rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0");

    const unsigned int skipped = lexer->current_code_point;

    if (RBS_UNLIKELY(skipped == '\0')) {
        return;
    }

    // Step over the character that is being skipped.
    lexer->current.byte_pos += lexer->current_character_bytes;
    lexer->current.char_pos += 1;

    if (skipped != '\n') {
        lexer->current.column += 1;
    } else {
        // A newline starts a new line: reset the column and flag the line start.
        lexer->current.line += 1;
        lexer->current.column = 0;
        lexer->first_token_of_line = true;
    }

    // Cache the character at the new position. When rbs_next_char reports that
    // there is none, store '\0' with a byte length of 1.
    unsigned int next_code_point;
    size_t next_bytes;

    if (!rbs_next_char(lexer, &next_code_point, &next_bytes)) {
        next_code_point = '\0';
        next_bytes = 1;
    }

    lexer->current_code_point = next_code_point;
    lexer->current_character_bytes = next_bytes;
}

Expand Down Expand Up @@ -156,35 +201,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) {
}
}

void rbs_skip(rbs_lexer_t *lexer) {
if (!lexer->last_char) {
rbs_peek(lexer);
}

size_t byte_len;

if (lexer->last_char == '\0') {
byte_len = 1;
} else {
const char *start = lexer->string.start + lexer->current.byte_pos;
byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start));
}

lexer->current.char_pos += 1;
lexer->current.byte_pos += byte_len;

if (lexer->last_char == '\n') {
lexer->current.line += 1;
lexer->current.column = 0;
lexer->first_token_of_line = true;
} else {
lexer->current.column += 1;
}
}

void rbs_skipn(rbs_lexer_t *lexer, size_t size) {
for (size_t i = 0; i < size; i++) {
rbs_peek(lexer);
rbs_skip(lexer);
}
}
Expand Down
31 changes: 27 additions & 4 deletions src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <string.h>

#include "rbs/defines.h"
#include "rbs/lexer.h"
#include "rbs/string.h"
#include "rbs/util/rbs_unescape.h"
#include "rbs/util/rbs_buffer.h"
Expand Down Expand Up @@ -3456,6 +3457,14 @@ void rbs_print_token(rbs_token_t tok) {
);
}

/**
 * Prints the lexer state to stdout for debugging: the input range and encoding
 * name, the start and current positions, the cached current character, and the
 * first_token_of_line flag.
 *
 * @param lexer The lexer to print. Must not be NULL.
 */
void rbs_print_lexer(rbs_lexer_t *lexer) {
    const unsigned int code_point = lexer->current_code_point;

    // Echo only printable ASCII verbatim. Control characters and non-ASCII
    // values are shown as '?' so a raw byte cannot garble the output.
    const int display = (code_point >= 0x20 && code_point < 0x7F) ? (int) code_point : '?';

    printf("Lexer: (range = %d...%d, encoding = %s)\n", lexer->start_pos, lexer->end_pos, lexer->encoding->name);
    printf(" start = { char_pos = %d, byte_pos = %d }\n", lexer->start.char_pos, lexer->start.byte_pos);
    printf(" current = { char_pos = %d, byte_pos = %d }\n", lexer->current.char_pos, lexer->current.byte_pos);
    // current_code_point is an unsigned int, so it is printed with %u, and the
    // %c argument is passed as an int.
    printf(" character = { code_point = %u (%c), bytes = %zu }\n", code_point, display, lexer->current_character_bytes);
    printf(" first_token_of_line = %s\n", lexer->first_token_of_line ? "true" : "false");
}

rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line) {
int comment_line = subject_line - 1;

Expand Down Expand Up @@ -3484,14 +3493,28 @@ rbs_lexer_t *rbs_lexer_new(rbs_allocator_t *allocator, rbs_string_t string, cons
.end_pos = end_pos,
.current = start_position,
.start = { 0 },
.first_token_of_line = false,
.last_char = 0,
.first_token_of_line = true,
.current_character_bytes = 0,
.current_code_point = '\0',
.encoding = encoding,
};

rbs_skipn(lexer, start_pos);
unsigned int codepoint;
size_t bytes;

if (rbs_next_char(lexer, &codepoint, &bytes)) {
lexer->current_code_point = codepoint;
lexer->current_character_bytes = bytes;
} else {
lexer->current_code_point = '\0';
lexer->current_character_bytes = 1;
}

if (start_pos > 0) {
rbs_skipn(lexer, start_pos);
}

lexer->start = lexer->current;
lexer->first_token_of_line = lexer->current.column == 0;

return lexer;
}
Expand Down
3 changes: 2 additions & 1 deletion src/string.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "rbs/string.h"
#include "rbs/defines.h"

#include <stdlib.h>
#include <string.h>
Expand All @@ -14,7 +15,7 @@ unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {

if (s >= end) return 0; // End of string

if ((*s & 0x80) == 0) {
if (RBS_LIKELY((*s & 0x80) == 0)) {
// Single byte character (0xxxxxxx)
return *s;
} else if ((*s & 0xE0) == 0xC0) {
Expand Down
Loading
Loading