Merge pull request #455 from robinmaisch/R

Add R language frontend
jplag · Jun 27, 2022 · 690c228 · 690c228
2 parents 6a7ba9b + 63e9f00
commit 690c228
Show file tree

Hide file tree

Showing 16 changed files with 1,368 additions and 1 deletion.
diff --git a/jplag.frontend.rlang/README.md b/jplag.frontend.rlang/README.md
@@ -0,0 +1,16 @@
+# JPlag R language frontend
+
+The JPlag R frontend allows the use of JPlag with submissions in R. <br>
+It was in part adapted from a [JPlag fork by CodeGra-de](https://github.com/CodeGra-de/jplag/tree/master/jplag.frontend.R).
+
+### R specification compatibility
+The underlying [grammar definition](https://github.com/antlr/grammars-v4/tree/master/r) was first created in June 2013, when R 3.0.1 was current. The latest commit is from April 2018, when R 3.5.0 was just released. Whether the grammar has been made to comply with any specific version of the R specification is unclear. Even if some parsing errors occur, the parser should be able to recover and still produce a valid analysis.
+
+### Token Extraction
+
+The choice of tokens is based directly on the CodeGra-de version, whereas the extraction process itself contains some fixes.
+
+Like in other frontends, e.g. for Java and C#, the tokens account for the beginning and the end of control flow structures, for control flow keywords, and some kinds of expressions. As R is very different from other programming languages in JPlag, it remains to be seen whether the R frontend can hold up to the others.
+
+### Usage
+To use the R frontend, add the `-l R` flag in the CLI, or use a `JPlagOption` object set to `LanguageOption.R` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).
diff --git a/jplag.frontend.rlang/pom.xml b/jplag.frontend.rlang/pom.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <!-- Parent is the de.jplag aggregator POM; its version is taken from the ${revision} property. -->
+    <parent>
+        <groupId>de.jplag</groupId>
+        <artifactId>aggregator</artifactId>
+        <version>${revision}</version>
+    </parent>
+    <artifactId>rlang</artifactId>
+
+    <dependencies>
+        <!-- No <version> is given for the first two dependencies; presumably managed by the parent POM (confirm there). -->
+        <dependency>
+            <groupId>org.antlr</groupId>
+            <artifactId>antlr4-runtime</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>de.jplag</groupId>
+            <artifactId>frontend-utils</artifactId>
+        </dependency>
+        <!-- Test-only dependency: the test-jar of frontend-testutils at the same ${revision}. -->
+        <dependency>
+            <groupId>de.jplag</groupId>
+            <artifactId>frontend-testutils</artifactId>
+            <version>${revision}</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <!-- Runs the antlr4 goal of the ANTLR Maven plugin as part of the build. -->
+            <plugin>
+                <groupId>org.antlr</groupId>
+                <artifactId>antlr4-maven-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>antlr4</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+</project>
diff --git a/jplag.frontend.rlang/src/main/antlr4/de/jplag/R/grammar/R.g4 b/jplag.frontend.rlang/src/main/antlr4/de/jplag/R/grammar/R.g4
@@ -0,0 +1,216 @@
+/*
+ [The "BSD licence"]
+ Copyright (c) 2013 Terence Parr
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ 3. The name of the author may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+derived from http://svn.r-project.org/R/trunk/src/main/gram.y
+http://cran.r-project.org/doc/manuals/R-lang.html#Parser
+I'm no R genius but this seems to work.
+Requires RFilter.g4 to strip away NL that are really whitespace,
+not end-of-command. See TestR.java
+Usage:
+$ antlr4 R.g4 RFilter.g4
+$ javac *.java
+$ java TestR sample.R
+... prints parse tree ...
+*/
+
+/*
+Modified version of the original in https://github.com/antlr/grammars-v4/blob/master/r/R.g4 so that I can separate the most relevant tokens of R in 
+the JplagRListener.java file.
+Author of the modification: Antonio Javier Rodriguez Perez
+*/
+
+grammar R;
+
+// Entry rule for a whole R source file: any number of expressions, each
+// followed by ';' or an NL token, interleaved with stand-alone NL tokens,
+// and finally EOF. Every expression needs its own terminator, including
+// the last one in the file.
+prog:   (   expr (';'|NL)
+        |   NL
+        )*
+        EOF
+    ;
+
+/*
+expr_or_assign
+    :   expr ('<-'|'='|'<<-') expr_or_assign
+    |   expr
+    ;
+*/
+
+// The single expression rule of the grammar. Operator alternatives are
+// listed from tightest to loosest binding: ANTLR 4 gives earlier alternatives
+// of a left-recursive rule higher precedence. '^' is declared
+// right-associative; the remaining binary operators use the default (left)
+// associativity. Statements such as if/for/while are expressions as well,
+// and the primary forms (parenthesised expression, ID, constant) come last.
+expr:   expr index_statement  // '[[' follows R's yacc grammar
+    |   expr access_package expr
+    |   expr ('$'|'@') expr
+    |   <assoc=right> expr '^' expr
+    |   ('-'|'+') expr
+    |   expr ':' expr
+    |   expr USER_OP expr // anything wrapped in %: '%' .* '%'
+    |   expr ('*'|'/') expr
+    |   expr ('+'|'-') expr
+    |   expr ('>'|'>='|'<'|'<='|'=='|'!=') expr
+    |   '!' expr
+    |   expr ('&'|'&&') expr
+    |   expr ('|'|'||') expr
+    |   '~' expr
+    |   expr '~' expr
+    |   expr assign_value expr
+    |   function_definition                 // define function
+    |   expr function_call                  // call function
+    |   compound_statement
+    |   if_statement
+    |   for_statement
+    |   while_statement
+    |   repeat_statement
+    |   help
+    |   next_statement
+    |   break_statement
+    |   '(' expr ')'
+    |   ID
+    |   constant
+    ;
+
+// Subscript suffix used by expr: double-bracket '[[ ... ]]' or single-bracket
+// '[ ... ]' indexing. The closing ']]' is matched as two separate ']' tokens.
+index_statement : '[[' sublist ']' ']' | '[' sublist ']' ;
+
+// The '::' and ':::' operators, used between two expressions in expr.
+access_package: '::'|':::' ;
+
+// Function literal: 'function', a parenthesised (optional) formal parameter
+// list, and a single body expression.
+function_definition: 'function' '(' formlist? ')' expr ;
+
+// Parenthesised argument list; expr applies it as a suffix to the callee.
+function_call : '(' sublist ')' ;
+
+// Any literal value: a number, a string, a boolean, or one of the keyword
+// constants NULL, NA, Inf, NaN.
+constant: constant_number | constant_string | constant_bool | 'NULL' | 'NA' | 'Inf' | 'NaN' ;
+
+// Numeric literal in any of the lexer's number token forms.
+constant_number: HEX | INT | FLOAT | COMPLEX ;
+
+// String literal (a single STRING token).
+constant_string: STRING ;
+
+// Boolean literal; only the full keywords TRUE and FALSE are matched here.
+constant_bool: 'TRUE' | 'FALSE' ;
+
+// Each construct below has its own rule so that it produces a dedicated
+// parse-tree node.
+
+help: '?' expr ; // get help on expr, usually string or ID
+
+// Conditional with a parenthesised condition; the second alternative adds an
+// 'else' branch.
+if_statement :  'if' '(' expr ')' expr | 'if' '(' expr ')' expr 'else' expr ;
+
+// 'for' loop: loop variable ID, the sequence expression after 'in', then the body.
+for_statement : 'for' '(' ID 'in' expr ')' expr ;
+
+// 'while' loop: parenthesised condition followed by the body expression.
+while_statement : 'while' '(' expr ')' expr ;
+
+// 'repeat' loop: keyword followed by the body expression.
+repeat_statement: 'repeat' expr ;
+
+// The bare 'next' keyword.
+next_statement: 'next' ;
+
+// The bare 'break' keyword.
+break_statement: 'break' ;
+
+// Braced block containing an expression list (possibly empty).
+compound_statement: '{' exprlist '}' ;
+
+// Contents of a braced block. Either empty (second alternative), or an
+// expression followed by any number of ';'/NL separators, each optionally
+// followed by another expression. Repeated and trailing separators are
+// therefore accepted, but a non-empty list must start with an expression.
+exprlist
+    :   expr ((';'|NL) expr?)*
+    |
+    ;
+
+// Formal parameter list of a function definition: one or more comma-separated forms.
+formlist : form (',' form)* ;
+
+// A single formal parameter: a bare name, or one of the forms in
+// assign_func_declaration (name with default value, or '...').
+form:   ID
+    |   assign_func_declaration
+    ;
+
+// Argument list of a call or index: one or more comma-separated entries.
+// Because sub may be empty, this also matches empty lists and skipped
+// positions such as in x[, 1].
+sublist : sub (',' sub)* ;
+
+// A single argument: a positional expression, a named/special argument, or nothing.
+sub :   expr
+    |   assign_value_list
+    |
+    ;
+
+// All assignment operators accepted between two expressions in expr.
+assign_value: '<-'|'<<-'|'='|'->'|'->>'|':=';
+
+// Formal parameter with a default value (ID '=' expr), or the '...' parameter.
+assign_func_declaration: ID '=' expr | '...' ;
+
+// Named argument in a call. The name may be an ID, a string, or NULL, and the
+// value after '=' may be omitted. '...' is also accepted.
+assign_value_list: ID '=' | ID '=' expr | constant_string '=' | constant_string '=' expr | 'NULL' '=' | 'NULL' '=' expr | '...' ;
+
+
+
+// Hexadecimal integer: '0x' or '0X', one or more hex digits, optional L/l suffix.
+HEX :   '0' ('x'|'X') HEXDIGIT+ [Ll]? ;
+
+// Decimal integer: one or more digits with an optional L/l suffix.
+INT :   DIGIT+ [Ll]? ;
+
+// One hexadecimal digit (either letter case).
+fragment
+HEXDIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
+
+// Floating point number in one of three shapes: digits '.' optional digits,
+// digits only, or '.' digits. Each may carry an exponent and an L/l suffix.
+// The digits-only shape overlaps with INT, which is declared earlier.
+FLOAT:  DIGIT+ '.' DIGIT* EXP? [Ll]?
+    |   DIGIT+ EXP? [Ll]?
+    |   '.' DIGIT+ EXP? [Ll]?
+    ;
+
+// One decimal digit.
+fragment
+DIGIT:  '0'..'9' ;
+
+// Exponent part of a FLOAT: 'E' or 'e', an optional sign, then an INT.
+fragment
+EXP :   ('E' | 'e') ('+' | '-')? INT ;
+
+// Imaginary literal: an INT or FLOAT immediately followed by 'i'.
+COMPLEX
+    :   INT 'i'
+    |   FLOAT 'i'
+    ;
+
+// Quoted text: double-quoted and single-quoted string literals, and
+// backtick-quoted names. Inside the quotes a backslash is only accepted as
+// the start of an ESC sequence, and each form excludes its OWN delimiter from
+// the plain characters. The backtick form previously excluded the single
+// quote instead, so a name such as `it's` could not be lexed. The loops are
+// non-greedy, so a literal ends at the first unescaped closing delimiter.
+STRING
+    :   '"' ( ESC | ~[\\"] )*? '"'
+    |   '\'' ( ESC | ~[\\'] )*? '\''
+    |   '`' ( ESC | ~[\\`] )*? '`'
+    ;
+// Escape sequence inside a STRING: a backslash followed by one of the
+// characters a b t n f r v " ' \, or one of the unicode, hex, or octal
+// escape fragments.
+fragment
+ESC :   '\\' [abtnfrv"'\\]
+    |   UNICODE_ESCAPE
+    |   HEX_ESCAPE
+    |   OCTAL_ESCAPE
+    ;
+
+// Unicode escapes as accepted by R: \u with 1-4 hex digits or \U with 1-8 hex
+// digits, each optionally wrapped in braces (\u{...}, \U{...}). The previous
+// version only matched \u with exactly four digits; everything it accepted is
+// still accepted.
+fragment
+UNICODE_ESCAPE
+    :   '\\' 'u' HEXDIGIT HEXDIGIT? HEXDIGIT? HEXDIGIT?
+    |   '\\' 'u' '{' HEXDIGIT HEXDIGIT? HEXDIGIT? HEXDIGIT? '}'
+    |   '\\' 'U' HEXDIGIT HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT?
+    |   '\\' 'U' '{' HEXDIGIT HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? HEXDIGIT? '}'
+    ;
+
+// Octal escape: a backslash followed by one to three octal digits. In the
+// three-digit form the first digit is limited to 0-3.
+fragment
+OCTAL_ESCAPE
+    :   '\\' [0-3] [0-7] [0-7]
+    |   '\\' [0-7] [0-7]
+    |   '\\' [0-7]
+    ;
+
+// Hexadecimal escape. R writes it as \x followed by one or two hex digits;
+// the rule previously lacked the 'x', so a string like "\x41" could not be
+// lexed. The 'x' is optional so that every sequence the old rule accepted
+// (a backslash directly followed by hex digits) is still accepted.
+fragment
+HEX_ESCAPE
+    :   '\\' 'x'? HEXDIGIT HEXDIGIT?
+    ;
+
+// Identifier. Either a '.' that is NOT followed by a digit (so that '.5'
+// stays a FLOAT), or a letter, followed by letters, digits, '_' and '.'.
+// The part after a leading '.' is optional so that a lone '.' is an ID; it is
+// a valid R name (e.g. in the formula `y ~ .`) and previously could not be
+// lexed at all. '...' is unaffected: it is a literal token of the parser
+// rules and still takes priority over ID.
+ID  :   '.' ((LETTER|'_'|'.') (LETTER|DIGIT|'_'|'.')*)?
+    |   LETTER (LETTER|DIGIT|'_'|'.')*
+    ;
+
+// One ASCII letter (either case).
+fragment LETTER  : [a-zA-Z] ;
+
+// User-defined infix operator: the shortest text enclosed in a pair of '%'
+// characters (non-greedy match), e.g. %in%.
+USER_OP :   '%' .*? '%' ;
+
+// A comment runs from '#' to the end of the line and is emitted as an NL
+// token, so it terminates a statement exactly like a newline would.
+// The end of input is accepted as terminator too: previously a comment on the
+// last line of a file without a trailing newline did not match this rule and
+// its text was lexed as if it were code.
+COMMENT :   '#' .*? ('\r'? '\n' | EOF) -> type(NL) ;
+
+// Match both UNIX and Windows newlines
+NL      :   '\r'? '\n' ;
+
+// Spaces, tabs and form feeds are dropped by the lexer (never reach the parser).
+WS      :   [ \t\u000C]+ -> skip ;
diff --git a/jplag.frontend.rlang/src/main/antlr4/de/jplag/R/grammar/RFilter.g4 b/jplag.frontend.rlang/src/main/antlr4/de/jplag/R/grammar/RFilter.g4
@@ -0,0 +1,83 @@
+/*
+ [The "BSD licence"]
+ Copyright (c) 2013 Terence Parr
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ 3. The name of the author may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/** Must process R input with this before passing to R.g4; see TestR.java
+    We strip NL inside expressions.
+ */
+
+parser grammar RFilter;
+
+options { tokenVocab=R; }
+
+// Parser field tracking how many '{' ... '}' blocks are currently open.
+// It is incremented and decremented by the actions in the '{' alternative of
+// `element` and read by the action attached to 'else'.
+@members {
+protected int curlies = 0;
+}
+
+// Entry rule of the filter: the whole token stream as a flat sequence of
+// elements, NL tokens and ';' up to EOF.
+// TODO: MAKE THIS GET ONE COMMAND ONLY
+stream : (element|NL|';')* EOF ;
+
+// Matches one or more NL tokens and moves each of them to the hidden channel.
+eat :   (NL {((WritableToken)$NL).setChannel(Token.HIDDEN_CHANNEL);})+ ;
+
+// One syntactic unit of the token stream. Its purpose is to decide which NL
+// tokens are hidden (via `eat`) and which remain visible:
+//  - NLs directly after an operator are hidden;
+//  - inside '(' ')', '[' ']' and '[[' ']' ']' every NL is hidden;
+//  - inside '{' '}' only the NLs directly after '{' are hidden, the others
+//    are matched as plain NL and stay visible; `curlies` counts the
+//    nesting depth while the block body is being matched;
+//  - NLs around the parenthesised header of function/for/while/if are hidden;
+//  - for 'else', the embedded action inspects the single token before
+//    'else' (LT(-2), since 'else' itself has already been consumed) and hides
+//    it if it is an NL and we are inside at least one '{' block.
+element:   op eat?
+    |   atom
+    |   '{' eat? {curlies++;} (element|NL|';')* {curlies--;} '}'
+    |   '(' (element|eat)* ')'
+    |   '[' (element|eat)* ']'
+    |   '[[' (element|eat)* ']' ']'
+    |   'function' eat? '(' (element|eat)* ')' eat?
+    |   'for' eat? '(' (element|eat)* ')' eat?
+    |   'while' eat? '(' (element|eat)* ')' eat?
+    |   'if' eat? '(' (element|eat)* ')' eat?
+    |   'else'
+        {
+        // ``inside a compound expression, a newline before else is discarded,
+        // whereas at the outermost level, the newline terminates the if
+        // construction and a subsequent else causes a syntax error.''
+        /*
+        Works here
+            if (1==0) { print(1) } else { print(2) }
+        and correctly gets error here:
+            if (1==0) { print(1) }
+            else { print(2) }
+        this works too:
+            if (1==0) {
+              if (2==0) print(1)
+              else print(2)
+            }
+        */
+        WritableToken tok = (WritableToken)_input.LT(-2);
+        if (curlies>0&&tok.getType()==NL) tok.setChannel(Token.HIDDEN_CHANNEL);
+        }
+    ;
+
+// Single-token elements after which a newline is significant: the 'next' and
+// 'break' keywords, identifiers, and all literal tokens. Unlike `op`, an atom
+// is not followed by `eat`, so an NL after it stays visible.
+atom:   'next' | 'break' | ID | STRING | HEX | INT | FLOAT | COMPLEX | 'NULL'
+    |   'NA' | 'Inf' | 'NaN' | 'TRUE' | 'FALSE'
+    ;
+
+// Tokens after which an expression must continue, so that `element` hides any
+// NL tokens following them. The list covers every assignment operator of
+// R.g4's assign_value rule: '<<-', '->>' and ':=' were previously missing,
+// which made the filter fail on them and leave a following line break
+// visible. A duplicated '=' entry was removed.
+op  :   '+'|'-'|'*'|'/'|'^'|'<'|'<='|'>='|'>'|'=='|'!='|'&'|'&&'|USER_OP|
+        'repeat'|'in'|'?'|'!'|'='|':'|'~'|'$'|'@'|'<-'|'<<-'|'->'|'->>'|':='|
+        '::'|':::'|','|'...'|'||'| '|'
+    ;