Merge pull request #1545 from jplag/feature/emf-mode-fixes

Improvement of the EMF metamodel and model languages
jplag · Feb 16, 2024 · 156b1da · 156b1da
2 parents d246d6d + c41cd89
commit 156b1da
Show file tree

Hide file tree

Showing 28 changed files with 452 additions and 254 deletions.
diff --git a/core/src/main/java/de/jplag/Submission.java b/core/src/main/java/de/jplag/Submission.java
@@ -239,6 +239,8 @@ private static File createErrorDirectory(String... subdirectoryNames) {
 
     /**
      * Parse files of the submission.
+     * @param debugParser specifies if the submission should be copied upon parsing errors.
+     * @param normalize specifies if the token sequences should be normalized.
      * @return Whether parsing was successful.
      */
     /* package-private */ boolean parse(boolean debugParser, boolean normalize) {
@@ -276,7 +278,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
     }
 
     /**
-     * Perform token string normalization, which makes the token string invariant to dead code insertion and independent
+     * Perform token sequence normalization, which makes the token sequence invariant to dead code insertion and independent
      * statement reordering.
      */
     void normalize() {

diff --git a/core/src/main/java/de/jplag/exceptions/ConfigurationException.java b/core/src/main/java/de/jplag/exceptions/ConfigurationException.java
@@ -1,9 +1,14 @@
 package de.jplag.exceptions;
 
+import java.io.Serial;
+
 /**
  * Exceptions used if configuration is wrong.
  */
 public class ConfigurationException extends ExitException {
+    @Serial
+    private static final long serialVersionUID = 4625302641982932127L; // generated
+
     public ConfigurationException(String message) {
         super(message);
     }

diff --git a/core/src/main/java/de/jplag/normalization/TokenStringNormalizer.java b/core/src/main/java/de/jplag/normalization/TokenStringNormalizer.java
@@ -13,19 +13,19 @@
 import de.jplag.Token;
 
 /**
- * Performs token string normalization.
+ * Performs token sequence normalization.
  */
 public class TokenStringNormalizer {
 
     private TokenStringNormalizer() {
     }
 
     /**
-     * Performs token string normalization. Tokens representing dead code have been eliminated and tokens representing
+     * Performs token sequence normalization. Tokens representing dead code have been eliminated and tokens representing
      * subsequent independent statements have been put in a fixed order. Works by first constructing a Normalization Graph
-     * and then turning it back into a token string.
-     * @param tokens The original token string, remains unaltered.
-     * @return The normalized token string.
+     * and then turning it back into a token sequence.
+     * @param tokens The original token sequence, remains unaltered.
+     * @return The normalized token sequence.
      */
     public static List<Token> normalize(List<Token> tokens) {
         SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph = new NormalizationGraphConstructor(tokens).get();

diff --git a/core/src/main/java/de/jplag/reporting/jsonfactory/serializer/LanguageSerializer.java b/core/src/main/java/de/jplag/reporting/jsonfactory/serializer/LanguageSerializer.java
@@ -1,6 +1,7 @@
 package de.jplag.reporting.jsonfactory.serializer;
 
 import java.io.IOException;
+import java.io.Serial;
 
 import de.jplag.Language;
 
@@ -10,6 +11,9 @@
 
 public class LanguageSerializer extends StdSerializer<Language> {
 
+    @Serial
+    private static final long serialVersionUID = 5944655736767387268L; // generated
+
     /**
      * Constructor used by the fasterxml.jackson
      */

diff --git a/language-api/src/main/java/de/jplag/Language.java b/language-api/src/main/java/de/jplag/Language.java
@@ -108,7 +108,8 @@ default List<File> customizeSubmissionOrder(List<File> submissions) {
     }
 
     /**
-     * @return True, if tokens for this language can be normalized
+     * @return True, if this language supports token sequence normalization. This does not include other normalization
+     * mechanisms that might be part of the language modules.
      */
     default boolean supportsNormalization() {
         return false;

diff --git a/languages/emf-metamodel-dynamic/README.md b/languages/emf-metamodel-dynamic/README.md
@@ -1,5 +1,5 @@
 # Dynamic EMF metamodel language module
-The dynamic EMF metamodel language module allows the use of JPlag with metamodel submissions.
+The dynamic EMF metamodel language module allows the use of JPlag with EMF metamodel submissions.
 It is based on the EMF API.
 
 ### EMF specification compatibility
@@ -9,8 +9,14 @@ This module is based on the EMF dependencies available on maven central. These m
 For the token extraction, we visit the containment tree of the metamodel and extract tokens for all metamodel elements based on their concrete metaclass. In this module, we thus extract tokens based on a dynamic token set.
 
 ### Usage
-To use this module, add the `-l emf-metamodel-dynamic` flag in the CLI, or use a `JPlagOption` object with `new DynamicEmfLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).
+Note that this language module is currently not offered via the CLI.
+Use the non-dynamic version instead (`-l emf`).
 
-### More Info
-More information can be found in the paper [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
-A short summary can be found on [Kudos](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
+### Report Viewer
+In the report viewer, Emfatic is used as a textual model view.
+
+### Literature
+* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
+* Its [Kudos Summary](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
+* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*]
+* *"Automated Detection of AI-Obfuscated Plagiarism in Modeling Assignments" (ICSE-SEET'24)*
diff --git a/...-metamodel-dynamic/src/main/java/de/jplag/emf/dynamic/parser/DynamicElementTokenizer.java b/...-metamodel-dynamic/src/main/java/de/jplag/emf/dynamic/parser/DynamicElementTokenizer.java
@@ -1,7 +1,7 @@
 package de.jplag.emf.dynamic.parser;
 
-import java.util.HashSet;
-import java.util.Set;
+import java.util.LinkedHashSet;
+import java.util.SequencedSet;
 
 import org.eclipse.emf.ecore.EClass;
 import org.eclipse.emf.ecore.EObject;
@@ -15,14 +15,7 @@
  */
 public class DynamicElementTokenizer implements ModelingElementTokenizer {
 
-    private final Set<TokenType> knownTokenTypes;
-
-    /**
-     * Creates the tokenizer, initially with an empty token set.
-     */
-    public DynamicElementTokenizer() {
-        knownTokenTypes = new HashSet<>();
-    }
+    private static final SequencedSet<TokenType> knownTokenTypes = new LinkedHashSet<>();
 
     @Override
     public TokenType element2Token(EObject modelElement) {
@@ -32,7 +25,7 @@ public TokenType element2Token(EObject modelElement) {
     }
 
     @Override
-    public Set<TokenType> allTokenTypes() {
-        return Set.copyOf(knownTokenTypes);
+    public SequencedSet<TokenType> allTokenTypes() {
+        return new LinkedHashSet<>(knownTokenTypes);
     }
 }
diff --git a/...emf-metamodel-dynamic/src/test/java/de/jplag/emf/dynamic/MinimalDynamicMetamodelTest.java b/...emf-metamodel-dynamic/src/test/java/de/jplag/emf/dynamic/MinimalDynamicMetamodelTest.java
@@ -47,7 +47,7 @@ public void setUp() {
     @DisplayName("Test tokens generated from example metamodels")
     void testBookstoreMetamodels() throws ParsingException {
         List<File> testFiles = Arrays.stream(TEST_SUBJECTS).map(path -> new File(BASE_PATH.toFile(), path)).toList();
-        List<Token> result = language.parse(new HashSet<>(testFiles));
+        List<Token> result = language.parse(new HashSet<>(testFiles), true);
         List<TokenType> tokenTypes = result.stream().map(Token::getType).toList();
         logger.debug(TokenPrinter.printTokens(result, baseDirectory, Optional.of(EmfLanguage.VIEW_FILE_SUFFIX)));
         logger.info("parsed token types: " + tokenTypes.stream().map(TokenType::getDescription).toList());

diff --git a/languages/emf-metamodel/README.md b/languages/emf-metamodel/README.md
@@ -1,5 +1,5 @@
 # EMF metamodel language module
-The EMF metamodel language module allows the use of JPlag with metamodel submissions.
+The EMF metamodel language module allows the use of JPlag with EMF metamodel submissions.
 It is based on the EMF API.
 
 ### EMF specification compatibility
@@ -9,8 +9,14 @@ This module is based on the EMF dependencies available on maven central. These m
 For the token extraction, we visit the containment tree of the metamodel and extract tokens for certain metamodel elements based on their metaclass. In this module, we extract tokens based on a [handcrafted token set](https://github.com/jplag/JPlag/blob/master/languages/emf-metamodel/src/main/java/de/jplag/emf/MetamodelTokenType.java). Note that tokens are not extracted for all concrete metaclasses. `EFactory`, `EGenericType`, and `EObject` are ignored. Moreover, for some metaclasses, multiple token types are extracted. Finally, some references are also used for token extraction.
 
 ### Usage
-To use this module, add the `-l emf-metamodel` flag in the CLI, or use a `JPlagOption` object with `new EmfLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).
+The input for this module is a set of EMF metamodels (`.ecore` files).
+To use this module, add the `-l emf` flag in the CLI, or use a `JPlagOption` object with `new EmfLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).
 
-### More Info
-More information can be found in the paper [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
-A short summary can be found on [Kudos](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
+### Report Viewer
+In the report viewer, Emfatic is used as a textual model view.
+
+### Literature
+* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
+* Its [Kudos Summary](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
+* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*]
+* *"Automated Detection of AI-Obfuscated Plagiarism in Modeling Assignments" (ICSE-SEET'24)*
diff --git a/languages/emf-metamodel/src/main/java/de/jplag/emf/EmfLanguage.java b/languages/emf-metamodel/src/main/java/de/jplag/emf/EmfLanguage.java
@@ -56,7 +56,7 @@ public int minimumTokenMatch() {
 
     @Override
     public List<Token> parse(Set<File> files, boolean normalize) throws ParsingException {
-        return parser.parse(files);
+        return parser.parse(files, normalize);
     }
 
     @Override

diff --git a/...es/emf-metamodel/src/main/java/de/jplag/emf/normalization/ContainmentOrderNormalizer.java b/...es/emf-metamodel/src/main/java/de/jplag/emf/normalization/ContainmentOrderNormalizer.java
@@ -45,9 +45,11 @@ public int compare(EObject first, EObject second) {
         // 0. comparison if token types are absent for one or more elements.
         if (firstType == null && secondType == null) {
             return 0;
-        } else if (firstType == null) {
+        }
+        if (firstType == null) {
             return -1;
-        } else if (secondType == null) {
+        }
+        if (secondType == null) {
             return 1;
         }
 
@@ -91,7 +93,7 @@ private List<EObject> calculatePath(TokenType type) {
         List<EObject> elements = modelElementsToSort.stream().filter(it -> type.equals(tokenizer.element2Token(it))).toList();
 
         // Generate token type distributions for the subtrees of the elements to sort:
-        Map<EObject, List<Double>> subtreeVectors = new HashMap<>();
+        Map<EObject, TokenOccurenceVector> subtreeVectors = new HashMap<>();
         elements.forEach(it -> subtreeVectors.put(it, tokenVectorGenerator.generateOccurenceVector(it.eAllContents())));
 
         // Calculate distance matrix:
@@ -118,15 +120,17 @@ private int countSubtreeTokens(EObject modelElement) {
         return count;
     }
 
-    private static double euclideanDistance(List<Double> first, List<Double> second) {
-        if (first.size() != second.size()) {
-            throw new IllegalArgumentException("Lists must have the same size");
-        }
+    /**
+     * Calculates the Euclidean distance for two token occurrence vectors. As they are zero-padded, they are virtually of
+     * the same length.
+     */
+    private static double euclideanDistance(TokenOccurenceVector first, TokenOccurenceVector second) {
         double sum = 0;
         for (int i = 0; i < first.size(); i++) {
             double diff = first.get(i) - second.get(i);
             sum += diff * diff;
         }
         return Math.sqrt(sum);
     }
+
 }
diff --git a/languages/emf-metamodel/src/main/java/de/jplag/emf/normalization/TokenOccurenceVector.java b/languages/emf-metamodel/src/main/java/de/jplag/emf/normalization/TokenOccurenceVector.java
@@ -0,0 +1,39 @@
+package de.jplag.emf.normalization;
+
+import java.util.List;
+
+/**
+ * A vector for the occurrence frequency of different token types. The vector is padded with zeroes beyond its original
+ * size. The vector content cannot be changed after its creation.
+ */
+public class TokenOccurenceVector {
+    private final List<Double> originalVector;
+
+    /**
+     * Creates a zero-padded token occurrence vector.
+     * @param originalVector specifies the occurrence frequency values for the vector.
+     */
+    public TokenOccurenceVector(List<Double> originalVector) {
+        this.originalVector = originalVector;
+    }
+
+    /**
+     * Returns the occurrence frequency value of the vector at the specified index.
+     * @param index is the specified index.
+     * @return the occurrence frequency value or zero if the index is beyond the size of the vector.
+     */
+    public double get(int index) {
+        if (index >= originalVector.size()) {
+            return 0.0;
+        }
+        return originalVector.get(index);
+    }
+
+    /**
+     * The original size of the vector, without padding.
+     * @return the size.
+     */
+    public int size() {
+        return originalVector.size();
+    }
+}
diff --git a/languages/emf-metamodel/src/main/java/de/jplag/emf/normalization/TokenVectorGenerator.java b/languages/emf-metamodel/src/main/java/de/jplag/emf/normalization/TokenVectorGenerator.java
@@ -10,7 +10,6 @@
 import org.eclipse.emf.ecore.EObject;
 
 import de.jplag.TokenType;
-import de.jplag.emf.MetamodelTokenType;
 import de.jplag.emf.parser.ModelingElementTokenizer;
 
 /**
@@ -27,10 +26,10 @@ public TokenVectorGenerator(ModelingElementTokenizer tokenizer) {
     /**
      * Generate a token occurrence vector for a subtree of a model.
      * @param modelElements is a visitor for the subtree.
-     * @return a list, where each entry represents the number of tokens in the subtree. The order is determined by
-     * {@link MetamodelTokenType}.
+     * @return a zero-padded token occurrence vector, where each entry represents the number of tokens in the subtree. The
+     * order is determined by {@link ModelingElementTokenizer#allTokenTypes()}.
      */
-    public List<Double> generateOccurenceVector(Iterator<EObject> modelElements) {
+    public TokenOccurenceVector generateOccurenceVector(Iterator<EObject> modelElements) {
         Map<TokenType, Integer> tokenTypeHistogram = new HashMap<>();
 
         while (modelElements.hasNext()) {
@@ -40,7 +39,7 @@ public List<Double> generateOccurenceVector(Iterator<EObject> modelElements) {
         for (TokenType type : tokenizer.allTokenTypes()) {
             occurenceVector.add(tokenTypeHistogram.getOrDefault(type, 0));
         }
-        return normalize(occurenceVector);
+        return new TokenOccurenceVector(normalize(occurenceVector));
     }
 
     public static List<Double> normalize(List<Integer> vector) {

diff --git a/languages/emf-metamodel/src/main/java/de/jplag/emf/parser/EcoreParser.java b/languages/emf-metamodel/src/main/java/de/jplag/emf/parser/EcoreParser.java
@@ -41,10 +41,10 @@ public EcoreParser() {
      * @param files is the set of files.
      * @return the list of parsed tokens.
      */
-    public List<Token> parse(Set<File> files) throws ParsingException {
+    public List<Token> parse(Set<File> files, boolean normalize) throws ParsingException {
         tokens = new ArrayList<>();
         for (File file : files) {
-            parseModelFile(file);
+            parseModelFile(file, normalize);
         }
         return tokens;
     }
@@ -53,21 +53,22 @@ public List<Token> parse(Set<File> files) throws ParsingException {
      * Loads a metamodel from a file and parses it.
      * @param file is the metamodel file.
      */
-    protected void parseModelFile(File file) throws ParsingException {
+    protected void parseModelFile(File file, boolean normalize) throws ParsingException {
         currentFile = file;
         Resource model = EMFUtil.loadModelResource(file);
         if (model == null) {
             throw new ParsingException(file, "failed to load model");
-        } else {
+        }
+        if (normalize) {
             normalizeOrder(model);
-            treeView = createView(file, model);
-            visitor = createMetamodelVisitor();
-            for (EObject root : model.getContents()) {
-                visitor.visit(root);
-            }
-            tokens.add(Token.fileEnd(currentFile));
-            treeView.writeToFile(getCorrespondingViewFileSuffix());
         }
+        treeView = createView(file, model);
+        visitor = createMetamodelVisitor();
+        for (EObject root : model.getContents()) {
+            visitor.visit(root);
+        }
+        tokens.add(Token.fileEnd(currentFile));
+        treeView.writeToFile(getCorrespondingViewFileSuffix());
     }
 
     /**