Skip to content

Commit

Permalink
Merge pull request #1545 from jplag/feature/emf-mode-fixes
Browse files Browse the repository at this point in the history
Improvement of the EMF metamodel and model languages
  • Loading branch information
tsaglam authored Feb 16, 2024
2 parents d246d6d + c41cd89 commit 156b1da
Show file tree
Hide file tree
Showing 28 changed files with 452 additions and 254 deletions.
4 changes: 3 additions & 1 deletion core/src/main/java/de/jplag/Submission.java
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ private static File createErrorDirectory(String... subdirectoryNames) {

/**
* Parse files of the submission.
* @param debugParser specifies if the submission should be copied upon parsing errors.
* @param normalize specifies if the token sequences should be normalized.
* @return Whether parsing was successful.
*/
/* package-private */ boolean parse(boolean debugParser, boolean normalize) {
Expand Down Expand Up @@ -276,7 +278,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
}

/**
* Perform token string normalization, which makes the token string invariant to dead code insertion and independent
* Perform token sequence normalization, which makes the token sequence invariant to dead code insertion and independent
* statement reordering.
*/
void normalize() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
package de.jplag.exceptions;

import java.io.Serial;

/**
* Exceptions used if configuration is wrong.
*/
public class ConfigurationException extends ExitException {
@Serial
private static final long serialVersionUID = 4625302641982932127L; // generated

public ConfigurationException(String message) {
super(message);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,19 @@
import de.jplag.Token;

/**
* Performs token string normalization.
* Performs token sequence normalization.
*/
public class TokenStringNormalizer {

private TokenStringNormalizer() {
}

/**
* Performs token string normalization. Tokens representing dead code have been eliminated and tokens representing
* Performs token sequence normalization. Tokens representing dead code have been eliminated and tokens representing
* subsequent independent statements have been put in a fixed order. Works by first constructing a Normalization Graph
* and then turning it back into a token string.
* @param tokens The original token string, remains unaltered.
* @return The normalized token string.
* and then turning it back into a token sequence.
* @param tokens The original token sequence, remains unaltered.
* @return The normalized token sequence.
*/
public static List<Token> normalize(List<Token> tokens) {
SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph = new NormalizationGraphConstructor(tokens).get();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package de.jplag.reporting.jsonfactory.serializer;

import java.io.IOException;
import java.io.Serial;

import de.jplag.Language;

Expand All @@ -10,6 +11,9 @@

public class LanguageSerializer extends StdSerializer<Language> {

@Serial
private static final long serialVersionUID = 5944655736767387268L; // generated

/**
* Constructor used by the fasterxml.jackson
*/
Expand Down
3 changes: 2 additions & 1 deletion language-api/src/main/java/de/jplag/Language.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ default List<File> customizeSubmissionOrder(List<File> submissions) {
}

/**
* @return True, if tokens for this language can be normalized
* @return True, if this language supports token sequence normalization. This does not include other normalization
* mechanisms that might be part of the language modules.
*/
default boolean supportsNormalization() {
return false;
Expand Down
16 changes: 11 additions & 5 deletions languages/emf-metamodel-dynamic/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Dynamic EMF metamodel language module
The dynamic EMF metamodel language module allows the use of JPlag with metamodel submissions.
The dynamic EMF metamodel language module allows the use of JPlag with EMF metamodel submissions.
It is based on the EMF API.

### EMF specification compatibility
Expand All @@ -9,8 +9,14 @@ This module is based on the EMF dependencies available on maven central. These m
For the token extraction, we visit the containment tree of the metamodel and extract tokens for all metamodel elements based on their concrete metaclass. In this module, we thus extract tokens based on a dynamic token set.

### Usage
To use this module, add the `-l emf-metamodel-dynamic` flag in the CLI, or use a `JPlagOption` object with `new DynamicEmfLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).
Note that this language module is currently not offered via the CLI.
Use the non-dynamic version instead (`-l emf`).

### More Info
More information can be found in the paper [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
A short summary can be found on [Kudos](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
### Report Viewer
In the report viewer, Emfatic is used as a textual model view.

### Literature
* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
* Its [Kudos Summary](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*]
* *"Automated Detection of AI-Obfuscated Plagiarism in Modeling Assignments" (ICSE-SEET'24)*
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.emf.dynamic.parser;

import java.util.HashSet;
import java.util.Set;
import java.util.LinkedHashSet;
import java.util.SequencedSet;

import org.eclipse.emf.ecore.EClass;
import org.eclipse.emf.ecore.EObject;
Expand All @@ -15,14 +15,7 @@
*/
public class DynamicElementTokenizer implements ModelingElementTokenizer {

private final Set<TokenType> knownTokenTypes;

/**
* Creates the tokenizer, initially with an empty token set.
*/
public DynamicElementTokenizer() {
knownTokenTypes = new HashSet<>();
}
private static final SequencedSet<TokenType> knownTokenTypes = new LinkedHashSet<>();

@Override
public TokenType element2Token(EObject modelElement) {
Expand All @@ -32,7 +25,7 @@ public TokenType element2Token(EObject modelElement) {
}

@Override
public Set<TokenType> allTokenTypes() {
return Set.copyOf(knownTokenTypes);
public SequencedSet<TokenType> allTokenTypes() {
return new LinkedHashSet<>(knownTokenTypes);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public void setUp() {
@DisplayName("Test tokens generated from example metamodels")
void testBookstoreMetamodels() throws ParsingException {
List<File> testFiles = Arrays.stream(TEST_SUBJECTS).map(path -> new File(BASE_PATH.toFile(), path)).toList();
List<Token> result = language.parse(new HashSet<>(testFiles));
List<Token> result = language.parse(new HashSet<>(testFiles), true);
List<TokenType> tokenTypes = result.stream().map(Token::getType).toList();
logger.debug(TokenPrinter.printTokens(result, baseDirectory, Optional.of(EmfLanguage.VIEW_FILE_SUFFIX)));
logger.info("parsed token types: " + tokenTypes.stream().map(TokenType::getDescription).toList());
Expand Down
16 changes: 11 additions & 5 deletions languages/emf-metamodel/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# EMF metamodel language module
The EMF metamodel language module allows the use of JPlag with metamodel submissions.
The EMF metamodel language module allows the use of JPlag with EMF metamodel submissions.
It is based on the EMF API.

### EMF specification compatibility
Expand All @@ -9,8 +9,14 @@ This module is based on the EMF dependencies available on maven central. These m
For the token extraction, we visit the containment tree of the metamodel and extract tokens for certain metamodel elements based on their metaclass. In this module, we extract tokens based on a [handcrafted token set](https://github.com/jplag/JPlag/blob/master/languages/emf-metamodel/src/main/java/de/jplag/emf/MetamodelTokenType.java). Note that tokens are not extracted for all concrete metaclasses. `EFactory`, `EGenericType`, and `EObject` are ignored. Moreover, for some metaclasses, multiple token types are extracted. Finally, some references are also used for token extraction.

### Usage
To use this module, add the `-l emf-metamodel` flag in the CLI, or use a `JPlagOption` object with `new EmfLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).
The input for this module is a set of EMF metamodels (`.ecore` files).
To use this module, add the `-l emf` flag in the CLI, or use a `JPlagOption` object with `new EmfLanguage()` as `language` in the Java API as described in the usage information in the [readme of the main project](https://github.com/jplag/JPlag#usage) and [in the wiki](https://github.com/jplag/JPlag/wiki/1.-How-to-Use-JPlag).

### More Info
More information can be found in the paper [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
A short summary can be found on [Kudos](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
### Report Viewer
In the report viewer, Emfatic is used as a textual model view.

### Literature
* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*](https://dl.acm.org/doi/10.1145/3550356.3556508).
* Its [Kudos Summary](https://www.growkudos.com/publications/10.1145%25252F3550356.3556508/reader).
* [*"Token-based Plagiarism Detection for Metamodels" (MODELS-C'22)*]
* *"Automated Detection of AI-Obfuscated Plagiarism in Modeling Assignments" (ICSE-SEET'24)*
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public int minimumTokenMatch() {

@Override
public List<Token> parse(Set<File> files, boolean normalize) throws ParsingException {
return parser.parse(files);
return parser.parse(files, normalize);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ public int compare(EObject first, EObject second) {
// 0. comparison if token types are absent for one or more elements.
if (firstType == null && secondType == null) {
return 0;
} else if (firstType == null) {
}
if (firstType == null) {
return -1;
} else if (secondType == null) {
}
if (secondType == null) {
return 1;
}

Expand Down Expand Up @@ -91,7 +93,7 @@ private List<EObject> calculatePath(TokenType type) {
List<EObject> elements = modelElementsToSort.stream().filter(it -> type.equals(tokenizer.element2Token(it))).toList();

// Generate token type distributions for the subtrees of the elements to sort:
Map<EObject, List<Double>> subtreeVectors = new HashMap<>();
Map<EObject, TokenOccurenceVector> subtreeVectors = new HashMap<>();
elements.forEach(it -> subtreeVectors.put(it, tokenVectorGenerator.generateOccurenceVector(it.eAllContents())));

// Calculate distance matrix:
Expand All @@ -118,15 +120,17 @@ private int countSubtreeTokens(EObject modelElement) {
return count;
}

private static double euclideanDistance(List<Double> first, List<Double> second) {
if (first.size() != second.size()) {
throw new IllegalArgumentException("Lists must have the same size");
}
/**
 * Calculates the euclidean distance for two token occurrence vectors. As they are zero-padded, they are virtually of
 * the same length, thus the distance is calculated over the length of the longer vector.
 * @param first is the first token occurrence vector.
 * @param second is the second token occurrence vector.
 * @return the euclidean distance between both vectors.
 */
private static double euclideanDistance(TokenOccurenceVector first, TokenOccurenceVector second) {
    // Iterate up to the size of the longer vector, otherwise trailing entries of a longer second vector would be
    // ignored and the distance would not be symmetric. Reading beyond the size of the shorter vector yields zero.
    int length = Math.max(first.size(), second.size());
    double sum = 0;
    for (int i = 0; i < length; i++) {
        double diff = first.get(i) - second.get(i);
        sum += diff * diff;
    }
    return Math.sqrt(sum);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package de.jplag.emf.normalization;

import java.util.List;

/**
 * A vector for the occurrence frequency of different token types. The vector is padded with zeroes beyond its original
 * size. The vector content cannot be changed after its creation.
 */
public class TokenOccurenceVector {
    private final List<Double> originalVector;

    /**
     * Creates a zero-padded token occurrence vector.
     * @param originalVector specifies the occurrence frequency values for the vector. The values are copied, thus later
     * changes to the passed list do not affect this vector. Must neither be null nor contain null values.
     */
    public TokenOccurenceVector(List<Double> originalVector) {
        // Defensive copy: storing the passed list directly would allow callers to alter the vector after creation.
        this.originalVector = List.copyOf(originalVector);
    }

    /**
     * Returns the occurrence frequency value of the vector at the specified index.
     * @param index is the specified index.
     * @return the occurrence frequency value or zero if the index is beyond the size of the vector.
     */
    public double get(int index) {
        if (index >= originalVector.size()) {
            return 0.0; // zero padding beyond the original size
        }
        return originalVector.get(index);
    }

    /**
     * The original size of the vector, without padding.
     * @return the size.
     */
    public int size() {
        return originalVector.size();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import org.eclipse.emf.ecore.EObject;

import de.jplag.TokenType;
import de.jplag.emf.MetamodelTokenType;
import de.jplag.emf.parser.ModelingElementTokenizer;

/**
Expand All @@ -27,10 +26,10 @@ public TokenVectorGenerator(ModelingElementTokenizer tokenizer) {
/**
* Generate a token occurrence vector for a subtree of a model.
* @param modelElements is a visitor for the subtree.
* @return a list, where each entry represents the number of tokens in the subtree. The order is determined by
* {@link MetamodelTokenType}.
* @return a zero padded token occurrence vector, where each entry represents the number of tokens in the subtree. The
* order is determined by {@link ModelingElementTokenizer#allTokenTypes()}.
*/
public List<Double> generateOccurenceVector(Iterator<EObject> modelElements) {
public TokenOccurenceVector generateOccurenceVector(Iterator<EObject> modelElements) {
Map<TokenType, Integer> tokenTypeHistogram = new HashMap<>();

while (modelElements.hasNext()) {
Expand All @@ -40,7 +39,7 @@ public List<Double> generateOccurenceVector(Iterator<EObject> modelElements) {
for (TokenType type : tokenizer.allTokenTypes()) {
occurenceVector.add(tokenTypeHistogram.getOrDefault(type, 0));
}
return normalize(occurenceVector);
return new TokenOccurenceVector(normalize(occurenceVector));
}

public static List<Double> normalize(List<Integer> vector) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ public EcoreParser() {
* @param files is the set of files.
* @return the list of parsed tokens.
*/
public List<Token> parse(Set<File> files) throws ParsingException {
public List<Token> parse(Set<File> files, boolean normalize) throws ParsingException {
tokens = new ArrayList<>();
for (File file : files) {
parseModelFile(file);
parseModelFile(file, normalize);
}
return tokens;
}
Expand All @@ -53,21 +53,22 @@ public List<Token> parse(Set<File> files) throws ParsingException {
* Loads a metamodel from a file and parses it.
* @param file is the metamodel file.
*/
protected void parseModelFile(File file) throws ParsingException {
protected void parseModelFile(File file, boolean normalize) throws ParsingException {
currentFile = file;
Resource model = EMFUtil.loadModelResource(file);
if (model == null) {
throw new ParsingException(file, "failed to load model");
} else {
}
if (normalize) {
normalizeOrder(model);
treeView = createView(file, model);
visitor = createMetamodelVisitor();
for (EObject root : model.getContents()) {
visitor.visit(root);
}
tokens.add(Token.fileEnd(currentFile));
treeView.writeToFile(getCorrespondingViewFileSuffix());
}
treeView = createView(file, model);
visitor = createMetamodelVisitor();
for (EObject root : model.getContents()) {
visitor.visit(root);
}
tokens.add(Token.fileEnd(currentFile));
treeView.writeToFile(getCorrespondingViewFileSuffix());
}

/**
Expand Down
Loading

0 comments on commit 156b1da

Please sign in to comment.