Skip to content

Commit d3af1b8

Browse files
committed
Enhance keep_tokens option for RubyVM::AbstractSyntaxTree parsing methods
Implementations of the Language Server Protocol (LSP) sometimes need token information. For example, both `m(1)` and `m(1, )` have the same AST structure apart from node locations, so it is impossible to detect the presence of `,` from the AST. However, in the latter case it might be better to suggest a list of variables for the second argument. Token information is important for such cases. This commit adds the following: * Add `keep_tokens` option for `RubyVM::AbstractSyntaxTree.parse`, `.parse_file` and `.of` * Add `RubyVM::AbstractSyntaxTree::Node#tokens` which returns tokens for the node including tokens for descendant nodes. * Add `RubyVM::AbstractSyntaxTree::Node#all_tokens` which returns all tokens for the input script regardless of the receiver node. [Feature #19070]
1 parent 082cfcf commit d3af1b8

File tree

9 files changed

+556
-104
lines changed

9 files changed

+556
-104
lines changed

NEWS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,14 @@ Note: We're only listing outstanding class updates.
192192

193193
* RubyVM::AbstractSyntaxTree
194194
* Add `error_tolerant` option for `parse`, `parse_file` and `of`. [[Feature #19013]]
195+
* Add `keep_tokens` option for `parse`, `parse_file` and `of`. Add `#tokens` and `#all_tokens`
196+
for `RubyVM::AbstractSyntaxTree::Node` [[Feature #19070]]
197+
198+
```ruby
199+
root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
200+
root.tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
201+
root.tokens.map{_1[2]}.join # => "x = 1 + 2"
202+
```
195203

196204
* Set
197205
* Set is now available as a built-in class without the need for `require "set"`. [[Feature #16989]]

ast.c

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ ast_new_internal(rb_ast_t *ast, const NODE *node)
6464
return obj;
6565
}
6666

67-
static VALUE rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant);
68-
static VALUE rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant);
67+
static VALUE rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens);
68+
static VALUE rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens);
6969

7070
static VALUE
7171
ast_parse_new(void)
@@ -85,32 +85,33 @@ ast_parse_done(rb_ast_t *ast)
8585
}
8686

8787
static VALUE
88-
ast_s_parse(rb_execution_context_t *ec, VALUE module, VALUE str, VALUE keep_script_lines, VALUE error_tolerant)
88+
ast_s_parse(rb_execution_context_t *ec, VALUE module, VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
8989
{
90-
return rb_ast_parse_str(str, keep_script_lines, error_tolerant);
90+
return rb_ast_parse_str(str, keep_script_lines, error_tolerant, keep_tokens);
9191
}
9292

9393
static VALUE
94-
rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant)
94+
rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
9595
{
9696
rb_ast_t *ast = 0;
9797

9898
StringValue(str);
9999
VALUE vparser = ast_parse_new();
100100
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
101101
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
102+
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
102103
ast = rb_parser_compile_string_path(vparser, Qnil, str, 1);
103104
return ast_parse_done(ast);
104105
}
105106

106107
static VALUE
107-
ast_s_parse_file(rb_execution_context_t *ec, VALUE module, VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
108+
ast_s_parse_file(rb_execution_context_t *ec, VALUE module, VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
108109
{
109-
return rb_ast_parse_file(path, keep_script_lines, error_tolerant);
110+
return rb_ast_parse_file(path, keep_script_lines, error_tolerant, keep_tokens);
110111
}
111112

112113
static VALUE
113-
rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
114+
rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
114115
{
115116
VALUE f;
116117
rb_ast_t *ast = 0;
@@ -122,6 +123,7 @@ rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
122123
VALUE vparser = ast_parse_new();
123124
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
124125
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
126+
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
125127
ast = rb_parser_compile_file_path(vparser, Qnil, f, 1);
126128
rb_io_close(f);
127129
return ast_parse_done(ast);
@@ -141,14 +143,15 @@ lex_array(VALUE array, int index)
141143
}
142144

143145
static VALUE
144-
rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant)
146+
rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
145147
{
146148
rb_ast_t *ast = 0;
147149

148150
array = rb_check_array_type(array);
149151
VALUE vparser = ast_parse_new();
150152
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
151153
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
154+
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
152155
ast = rb_parser_compile_generic(vparser, lex_array, Qnil, array, 1);
153156
return ast_parse_done(ast);
154157
}
@@ -208,7 +211,7 @@ node_id_for_backtrace_location(rb_execution_context_t *ec, VALUE module, VALUE l
208211
}
209212

210213
static VALUE
211-
ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script_lines, VALUE error_tolerant)
214+
ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
212215
{
213216
VALUE node, lines = Qnil;
214217
const rb_iseq_t *iseq;
@@ -247,13 +250,13 @@ ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script
247250
}
248251

249252
if (!NIL_P(lines) || !NIL_P(lines = script_lines(path))) {
250-
node = rb_ast_parse_array(lines, keep_script_lines, error_tolerant);
253+
node = rb_ast_parse_array(lines, keep_script_lines, error_tolerant, keep_tokens);
251254
}
252255
else if (e_option) {
253-
node = rb_ast_parse_str(rb_e_script, keep_script_lines, error_tolerant);
256+
node = rb_ast_parse_str(rb_e_script, keep_script_lines, error_tolerant, keep_tokens);
254257
}
255258
else {
256-
node = rb_ast_parse_file(path, keep_script_lines, error_tolerant);
259+
node = rb_ast_parse_file(path, keep_script_lines, error_tolerant, keep_tokens);
257260
}
258261

259262
return node_find(node, node_id);
@@ -715,6 +718,15 @@ ast_node_last_column(rb_execution_context_t *ec, VALUE self)
715718
return INT2NUM(nd_last_column(data->node));
716719
}
717720

721+
static VALUE
722+
ast_node_all_tokens(rb_execution_context_t *ec, VALUE self)
723+
{
724+
struct ASTNodeData *data;
725+
TypedData_Get_Struct(self, struct ASTNodeData, &rb_node_type, data);
726+
727+
return rb_ast_tokens(data->ast);
728+
}
729+
718730
static VALUE
719731
ast_node_inspect(rb_execution_context_t *ec, VALUE self)
720732
{

ast.rb

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ module RubyVM::AbstractSyntaxTree
2929
#
3030
# RubyVM::AbstractSyntaxTree.parse("x = 1 + 2")
3131
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-1:9>
32-
def self.parse string, keep_script_lines: false, error_tolerant: false
33-
Primitive.ast_s_parse string, keep_script_lines, error_tolerant
32+
def self.parse string, keep_script_lines: false, error_tolerant: false, keep_tokens: false
33+
Primitive.ast_s_parse string, keep_script_lines, error_tolerant, keep_tokens
3434
end
3535

3636
# call-seq:
@@ -44,8 +44,8 @@ def self.parse string, keep_script_lines: false, error_tolerant: false
4444
#
4545
# RubyVM::AbstractSyntaxTree.parse_file("my-app/app.rb")
4646
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-31:3>
47-
def self.parse_file pathname, keep_script_lines: false, error_tolerant: false
48-
Primitive.ast_s_parse_file pathname, keep_script_lines, error_tolerant
47+
def self.parse_file pathname, keep_script_lines: false, error_tolerant: false, keep_tokens: false
48+
Primitive.ast_s_parse_file pathname, keep_script_lines, error_tolerant, keep_tokens
4949
end
5050

5151
# call-seq:
@@ -63,8 +63,8 @@ def self.parse_file pathname, keep_script_lines: false, error_tolerant: false
6363
#
6464
# RubyVM::AbstractSyntaxTree.of(method(:hello))
6565
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-3:3>
66-
def self.of body, keep_script_lines: false, error_tolerant: false
67-
Primitive.ast_s_of body, keep_script_lines, error_tolerant
66+
def self.of body, keep_script_lines: false, error_tolerant: false, keep_tokens: false
67+
Primitive.ast_s_of body, keep_script_lines, error_tolerant, keep_tokens
6868
end
6969

7070
# call-seq:
@@ -136,6 +136,46 @@ def last_column
136136
Primitive.ast_node_last_column
137137
end
138138

139+
# call-seq:
140+
# node.tokens -> array
141+
#
142+
# Returns tokens corresponding to the location of the node.
143+
# Returns nil if keep_tokens was not enabled when the parse method was called.
144+
# Each token is an array of:
145+
#
146+
# - id
147+
# - token type
148+
# - source code text
149+
# - location [first_lineno, first_column, last_lineno, last_column]
150+
#
151+
# root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
152+
# root.tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
153+
# root.tokens.map{_1[2]}.join # => "x = 1 + 2"
154+
def tokens
155+
return nil unless all_tokens
156+
157+
all_tokens.each_with_object([]) do |token, a|
158+
loc = token.last
159+
if ([first_lineno, first_column] <=> [loc[0], loc[1]]) <= 0 &&
160+
([last_lineno, last_column] <=> [loc[2], loc[3]]) >= 0
161+
a << token
162+
end
163+
end
164+
end
165+
166+
# call-seq:
167+
# node.all_tokens -> array
168+
#
169+
# Returns all tokens for the input script regardless of the receiver node.
170+
# Returns nil if keep_tokens was not enabled when the parse method was called.
171+
#
172+
# root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
173+
# root.all_tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
174+
# root.children[-1].all_tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
175+
def all_tokens
176+
Primitive.ast_node_all_tokens
177+
end
178+
139179
# call-seq:
140180
# node.children -> array
141181
#

ext/ripper/eventids2.c

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,3 @@
1-
enum {
2-
tIGNORED_NL = tLAST_TOKEN + 1,
3-
# define tIGNORED_NL ((enum yytokentype)tIGNORED_NL)
4-
tCOMMENT,
5-
# define tCOMMENT ((enum yytokentype)tCOMMENT)
6-
tEMBDOC_BEG,
7-
# define tEMBDOC_BEG ((enum yytokentype)tEMBDOC_BEG)
8-
tEMBDOC,
9-
# define tEMBDOC ((enum yytokentype)tEMBDOC)
10-
tEMBDOC_END,
11-
# define tEMBDOC_END ((enum yytokentype)tEMBDOC_END)
12-
tHEREDOC_BEG,
13-
# define tHEREDOC_BEG ((enum yytokentype)tHEREDOC_BEG)
14-
tHEREDOC_END,
15-
# define tHEREDOC_END ((enum yytokentype)tHEREDOC_END)
16-
k__END__,
17-
# define k__END__ ((enum yytokentype)k__END__)
18-
};
19-
201
typedef struct {
212
ID ripper_id_backref;
223
ID ripper_id_backtick;

internal/parse.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ VALUE rb_parser_set_yydebug(VALUE, VALUE);
1616
void *rb_parser_load_file(VALUE parser, VALUE name);
1717
void rb_parser_keep_script_lines(VALUE vparser);
1818
void rb_parser_error_tolerant(VALUE vparser);
19+
void rb_parser_keep_tokens(VALUE vparser);
1920

2021
RUBY_SYMBOL_EXPORT_BEGIN
2122
VALUE rb_parser_set_context(VALUE, const struct rb_iseq_struct *, int);

node.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1161,6 +1161,12 @@ struct node_buffer_struct {
11611161
node_buffer_list_t markable;
11621162
struct rb_ast_local_table_link *local_tables;
11631163
VALUE mark_hash;
1164+
// - id (sequence number)
1165+
// - token_type
1166+
// - text of token
1167+
// - location info
1168+
// Stored as an Array in which each entry is itself an Array of the fields listed above
1169+
VALUE tokens;
11641170
};
11651171

11661172
static void
@@ -1187,6 +1193,7 @@ rb_node_buffer_new(void)
11871193
init_node_buffer_list(&nb->markable, (node_buffer_elem_t*)((size_t)nb->unmarkable.head + bucket_size));
11881194
nb->local_tables = 0;
11891195
nb->mark_hash = Qnil;
1196+
nb->tokens = Qnil;
11901197
return nb;
11911198
}
11921199

@@ -1418,7 +1425,10 @@ rb_ast_update_references(rb_ast_t *ast)
14181425
void
14191426
rb_ast_mark(rb_ast_t *ast)
14201427
{
1421-
if (ast->node_buffer) rb_gc_mark(ast->node_buffer->mark_hash);
1428+
if (ast->node_buffer) {
1429+
rb_gc_mark(ast->node_buffer->mark_hash);
1430+
rb_gc_mark(ast->node_buffer->tokens);
1431+
}
14221432
if (ast->body.compile_option) rb_gc_mark(ast->body.compile_option);
14231433
if (ast->node_buffer) {
14241434
node_buffer_t *nb = ast->node_buffer;
@@ -1477,3 +1487,15 @@ rb_ast_add_mark_object(rb_ast_t *ast, VALUE obj)
14771487
}
14781488
rb_hash_aset(ast->node_buffer->mark_hash, obj, Qtrue);
14791489
}
1490+
1491+
VALUE
1492+
rb_ast_tokens(rb_ast_t *ast)
1493+
{
1494+
return ast->node_buffer->tokens;
1495+
}
1496+
1497+
void
1498+
rb_ast_set_tokens(rb_ast_t *ast, VALUE tokens)
1499+
{
1500+
RB_OBJ_WRITE(ast, &ast->node_buffer->tokens, tokens);
1501+
}

node.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,8 @@ void rb_ast_dispose(rb_ast_t*);
421421
void rb_ast_free(rb_ast_t*);
422422
size_t rb_ast_memsize(const rb_ast_t*);
423423
void rb_ast_add_mark_object(rb_ast_t*, VALUE);
424+
void rb_ast_set_tokens(rb_ast_t*, VALUE);
425+
VALUE rb_ast_tokens(rb_ast_t *ast);
424426
NODE *rb_ast_newnode(rb_ast_t*, enum node_type type);
425427
void rb_ast_delete_node(rb_ast_t*, NODE *n);
426428
rb_ast_id_table_t *rb_ast_new_local_table(rb_ast_t*, int);

0 commit comments

Comments
 (0)