From b4e19e12a912164833c76139908f1c3717702258 Mon Sep 17 00:00:00 2001 From: brandonspark Date: Mon, 25 Mar 2024 20:37:57 -0700 Subject: [PATCH 1/5] feat: fix newlines in class decls --- .../src/semgrep-kotlin/grammar.js | 20 +- .../semgrep-kotlin/test/corpus/semgrep.txt | 178 ++++++++++++++---- 2 files changed, 162 insertions(+), 36 deletions(-) diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js index 17df70aa..dc897c91 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js @@ -6,7 +6,7 @@ */ // INVARIATN: Make sure that you are merging any commits into the `semgrep` -// branch of `tree-sitter-kotlin`! This is because our version of +// branch of `tree-sitter-kotlin`! This is because our version of // `tree-sitter-kotlin` is forked from the original repository, and we // want our branch to be kept separate. @@ -40,7 +40,7 @@ module.exports = grammar(standard_grammar, { typed_metavar: $ => seq( "(", $.simple_identifier, ":", $._type, ")" - ), + ), // Statement ellipsis: '...' not followed by ';' _expression: ($, previous) => { @@ -60,6 +60,8 @@ module.exports = grammar(standard_grammar, { ); }, + secondary_constructor: ($, previous) => prec(500, previous), + _class_member_declaration: ($, previous) => { return choice( previous, @@ -74,13 +76,25 @@ module.exports = grammar(standard_grammar, { ); }, + _statement: ($, previous) => choice( + previous, + prec.left(1000, seq( + optional($.type_parameters), + seq(optional($.modifiers), "constructor"), + prec(5, $._class_parameters), + optional(seq(":", $._delegation_specifiers)), + optional($.type_constraints), + optional($.class_body) + )) + ), + class_parameter: ($, previous) => { return choice( previous, $.ellipsis ); }, - + deep_ellipsis: $ => seq( '<...', $._expression, '...>' ), diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt index f12c8c49..d87a8843 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt +++ b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt @@ -32,7 +32,6 @@ class $CLASS { (value_argument (integer_literal))))))))))) - ===================================== Typed Metavariables ===================================== @@ -54,7 +53,7 @@ val x = ($X : int) Ellipsis Metavariable ===================================== -$...X +$...X val x = 2 --- @@ -67,7 +66,7 @@ val x = 2 (integer_literal))) ===================================== -Expression Ellipsis +Expression Ellipsis ===================================== class Foo { @@ -89,7 +88,6 @@ class Foo { (statements (ellipsis))))))) - ===================================== Deep Ellipsis ===================================== @@ -132,9 +130,12 @@ class Foo { } --- + (source_file - (class_declaration (type_identifier) - (class_body (ellipsis)))) + (class_declaration + (type_identifier) + (class_body + (ellipsis)))) ===================================== Argument Ellipsis @@ -142,12 +143,18 @@ Argument Ellipsis foo(1, ..., 2) --- + (source_file - (call_expression (simple_identifier) - (call_suffix (value_arguments - (value_argument (integer_literal)) - (value_argument (ellipsis)) - (value_argument (integer_literal)))))) + (call_expression + (simple_identifier) + (call_suffix + (value_arguments + (value_argument + (integer_literal)) + (value_argument + (ellipsis)) + (value_argument + (integer_literal)))))) ===================================== Parameter Ellipsis @@ -157,12 +164,16 @@ fun foo(..., bar: String, ...) {} --- (source_file - (function_declaration (simple_identifier) - (function_value_parameters + (function_declaration + (simple_identifier) + (function_value_parameters (ellipsis) - (parameter (simple_identifier) (user_type (type_identifier))) + (parameter + (simple_identifier) + (user_type + (type_identifier))) (ellipsis)) - (function_body))) + (function_body))) ===================================== Class Parameter Ellipsis @@ -171,11 +182,17 @@ class Foo(...) : Filter { } --- + (source_file - (class_declaration (type_identifier) - (primary_constructor (class_parameter (ellipsis))) - (delegation_specifier (user_type (type_identifier))) - (class_body))) + (class_declaration + (type_identifier) + (primary_constructor + (class_parameter + (ellipsis))) + (delegation_specifier + (user_type + (type_identifier))) + (class_body))) ===================================== Statement Ellipsis @@ -187,15 +204,22 @@ fun foo() { } --- + (source_file - (function_declaration (simple_identifier) + (function_declaration + (simple_identifier) (function_value_parameters) (function_body (statements - (call_expression (simple_identifier) (call_suffix (value_arguments))) - (ellipsis) - (call_expression (simple_identifier) (call_suffix (value_arguments)))) - ))) + (call_expression + (simple_identifier) + (call_suffix + (value_arguments))) + (ellipsis) + (call_expression + (simple_identifier) + (call_suffix + (value_arguments))))))) ===================================== Statement Ellipsis and metavarible @@ -205,11 +229,17 @@ $COOKIE.setValue("") ... --- + (source_file (call_expression - (navigation_expression (simple_identifier) - (navigation_suffix (simple_identifier))) - (call_suffix (value_arguments (value_argument (string_literal))))) + (navigation_expression + (simple_identifier) + (navigation_suffix + (simple_identifier))) + (call_suffix + (value_arguments + (value_argument + (string_literal))))) (ellipsis)) ===================================== @@ -218,14 +248,96 @@ Method chaining Ellipsis obj. ... .foo().bar() --- + (source_file (call_expression (navigation_expression (call_expression (navigation_expression - (navigation_expression (simple_identifier) - (navigation_suffix (ellipsis))) - (navigation_suffix (simple_identifier))) - (call_suffix (value_arguments))) - (navigation_suffix (simple_identifier))) - (call_suffix (value_arguments)))) + (navigation_expression + (simple_identifier) + (navigation_suffix + (ellipsis))) + (navigation_suffix + (simple_identifier))) + (call_suffix + (value_arguments))) + (navigation_suffix + (simple_identifier))) + (call_suffix + (value_arguments)))) + +===================================== +Class with a newline +===================================== + +class BlahClass +constructor(arg: Int) { + private val blah2 = "foo${arg}" +} + +--- + +(source_file + (class_declaration + (type_identifier)) + (class_parameter + (simple_identifier) + (user_type + (type_identifier))) + (class_body + (property_declaration + (modifiers + (visibility_modifier)) + (variable_declaration + (simple_identifier)) + (string_literal + (interpolated_expression + (simple_identifier)))))) + +===================================== +Class in a class +===================================== + +class Foo { + class BlahClass + constructor(arg: Int) { + private val blah2 = "foo${arg}" + } +} + +--- + +(source_file + (class_declaration + (type_identifier) + (class_body + (class_declaration + (type_identifier)) + (secondary_constructor + (function_value_parameters + (parameter + (simple_identifier) + (user_type + (type_identifier)))) + (statements + (property_declaration + (modifiers + (visibility_modifier)) + (variable_declaration + (simple_identifier)) + (string_literal + (interpolated_expression + (simple_identifier))))))))) + +===================================== +Class without body +===================================== + +class Foo + +--- + +(source_file + (class_declaration + (type_identifier))) From 68235d54a9484e81b2c9f4256e4e326395e47a1e Mon Sep 17 00:00:00 2001 From: brandonspark Date: Mon, 25 Mar 2024 20:53:05 -0700 Subject: [PATCH 2/5] fix: add some comments and simplify --- .../src/semgrep-kotlin/grammar.js | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js index dc897c91..03a58a53 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js @@ -60,8 +60,6 @@ module.exports = grammar(standard_grammar, { ); }, - secondary_constructor: ($, previous) => prec(500, previous), - _class_member_declaration: ($, previous) => { return choice( previous, @@ -76,12 +74,28 @@ module.exports = grammar(standard_grammar, { ); }, + // We would like to be able to parse programs which have a newline between the + // class name and the constructor: + // class Foo + // constructor Bar() { ... } + + // The problem is that the Kotlin parser inserts a semicolon after "Foo", making + // it such that we get interrupted in the middle of the class_declaration. + // To make it so we can continue, we allow everything after the class identifier + // to be a standalone statement in its own right. This way, we can parse both parts + // individually, and stitch them together at parsing time. + + // We only need to amend statements here, because the consumers of _declaration are + // only class_member_declaration, top_level_object and _statement. + // The former has `secondary_constructor`, which already looks like what we want to + // add, and the second seems to be unused. + // So we just need to fix _statement. _statement: ($, previous) => choice( previous, - prec.left(1000, seq( + prec.left(seq( optional($.type_parameters), seq(optional($.modifiers), "constructor"), - prec(5, $._class_parameters), + $._class_parameters, optional(seq(":", $._delegation_specifiers)), optional($.type_constraints), optional($.class_body) From ea10bc49066ec6daf070dab7bb49efaa7ed669f5 Mon Sep 17 00:00:00 2001 From: Amarin Phaosawasdi Date: Tue, 26 Mar 2024 12:30:33 -0700 Subject: [PATCH 3/5] group params --- .../src/semgrep-kotlin/grammar.js | 8 +++--- .../semgrep-kotlin/test/corpus/semgrep.txt | 27 ++++++++++--------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js index 03a58a53..a5063a88 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js @@ -92,15 +92,17 @@ module.exports = grammar(standard_grammar, { // So we just need to fix _statement. _statement: ($, previous) => choice( previous, - prec.left(seq( + $.partial_class_declaration, + ), + + partial_class_declaration: $ => prec.left(seq( optional($.type_parameters), seq(optional($.modifiers), "constructor"), $._class_parameters, optional(seq(":", $._delegation_specifiers)), optional($.type_constraints), optional($.class_body) - )) - ), + )), class_parameter: ($, previous) => { return choice( diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt index d87a8843..b4f6687d 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt +++ b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt @@ -281,19 +281,20 @@ constructor(arg: Int) { (source_file (class_declaration (type_identifier)) - (class_parameter - (simple_identifier) - (user_type - (type_identifier))) - (class_body - (property_declaration - (modifiers - (visibility_modifier)) - (variable_declaration - (simple_identifier)) - (string_literal - (interpolated_expression - (simple_identifier)))))) + (partial_class_declaration + (class_parameter + (simple_identifier) + (user_type + (type_identifier))) + (class_body + (property_declaration + (modifiers + (visibility_modifier)) + (variable_declaration + (simple_identifier)) + (string_literal + (interpolated_expression + (simple_identifier))))))) ===================================== Class in a class From 2f77e1b933fd416f429481ceabaf8b999d7b2005 Mon Sep 17 00:00:00 2001 From: Amarin Phaosawasdi Date: Tue, 26 Mar 2024 19:21:56 -0700 Subject: [PATCH 4/5] refactor grammar a bit more --- .../src/semgrep-kotlin/grammar.js | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js index a5063a88..0c3886a4 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js @@ -91,18 +91,18 @@ module.exports = grammar(standard_grammar, { // add, and the second seems to be unused. // So we just need to fix _statement. _statement: ($, previous) => choice( - previous, - $.partial_class_declaration, + ...previous.members, + $.partial_class_declaration, ), - partial_class_declaration: $ => prec.left(seq( - optional($.type_parameters), - seq(optional($.modifiers), "constructor"), - $._class_parameters, - optional(seq(":", $._delegation_specifiers)), - optional($.type_constraints), - optional($.class_body) - )), + partial_class_declaration: $ => prec.left(seq( + optional($.type_parameters), + seq(optional($.modifiers), "constructor"), + $._class_parameters, + optional(seq(":", $._delegation_specifiers)), + optional($.type_constraints), + optional($.class_body) + )), class_parameter: ($, previous) => { return choice( From c270b2fa93b031c31b750375a6190434feb29d03 Mon Sep 17 00:00:00 2001 From: Amarin Phaosawasdi Date: Wed, 27 Mar 2024 11:52:58 -0700 Subject: [PATCH 5/5] address code review --- lang/semgrep-grammars/src/semgrep-kotlin/grammar.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js index 0c3886a4..fc30abb9 100644 --- a/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-kotlin/grammar.js @@ -90,6 +90,11 @@ module.exports = grammar(standard_grammar, { // The former has `secondary_constructor`, which already looks like what we want to // add, and the second seems to be unused. // So we just need to fix _statement. + + // A more proper fix would likely require changes to the external scanner to properly + // handle automatic semicolon insertion, which is the cause of this whole issue. + // We filed an issue to tree-sitter-kotlin to track this: + // https://github.com/fwcd/tree-sitter-kotlin/issues/75 _statement: ($, previous) => choice( ...previous.members, $.partial_class_declaration,