Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/normalization option #1479

Merged
merged 18 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cli/src/main/java/de/jplag/cli/CLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ public JPlagOptions buildOptionsFromArguments(ParseResult parseResult) throws Cl
JPlagOptions jPlagOptions = new JPlagOptions(loadLanguage(parseResult), this.options.minTokenMatch, submissionDirectories,
oldSubmissionDirectories, null, this.options.advanced.subdirectory, suffixes, this.options.advanced.exclusionFileName,
JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.options.advanced.similarityThreshold, this.options.shownComparisons, clusteringOptions,
this.options.advanced.debug, mergingOptions);
this.options.advanced.debug, mergingOptions, this.options.normalize);

String baseCodePath = this.options.baseCode;
File baseCodeDirectory = baseCodePath == null ? null : new File(baseCodePath);
Expand Down
3 changes: 3 additions & 0 deletions cli/src/main/java/de/jplag/cli/CliOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ public class CliOptions implements Runnable {
@ArgGroup(validate = false, heading = "Merging of neighboring matches to increase the similarity of concealed plagiarism:%n")
public Merging merging = new Merging();

@Option(names = {"--normalize"}, description = "Activate the normalization of tokens. Supported for languages: Java, C++.")
public boolean normalize = false;

/**
* Empty run method, so picocli prints help automatically
*/
Expand Down
7 changes: 7 additions & 0 deletions core/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ public static JPlagResult run(JPlagOptions options) throws ExitException {
// Parse and validate submissions.
SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
SubmissionSet submissionSet = builder.buildSubmissionSet();
if (options.normalize() && options.language().supportsNormalization() && options.language().requiresCoreNormalization()) {
submissionSet.normalizeSubmissions();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good for now. When we enable it for EMF, we will need a language method that can be called for the normalization.

}
int submissionCount = submissionSet.numberOfSubmissions();
if (submissionCount < 2)
throw new SubmissionException("Not enough valid submissions! (found " + submissionCount + " valid submissions)");
Expand Down Expand Up @@ -103,6 +106,10 @@ private static void logSkippedSubmissions(SubmissionSet submissionSet, JPlagOpti
}

private static void checkForConfigurationConsistency(JPlagOptions options) throws RootDirectoryException {
if (options.normalize() && !options.language().supportsNormalization()) {
logger.error(String.format("The language %s cannot be used with normalization.", options.language().getName()));
}

List<String> duplicateNames = getDuplicateSubmissionFolderNames(options);
if (duplicateNames.size() > 0) {
throw new RootDirectoryException(String.format("Duplicate root directory names found: %s", String.join(", ", duplicateNames)));
Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/de/jplag/Submission.java
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
* Parse files of the submission.
* @return Whether parsing was successful.
*/
/* package-private */ boolean parse(boolean debugParser) {
/* package-private */ boolean parse(boolean debugParser, boolean normalize) {
if (files == null || files.isEmpty()) {
logger.error("ERROR: nothing to parse for submission \"{}\"", name);
tokenList = null;
Expand All @@ -246,7 +246,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
}

try {
tokenList = language.parse(new HashSet<>(files));
tokenList = language.parse(new HashSet<>(files), normalize);
if (logger.isDebugEnabled()) {
for (Token token : tokenList) {
logger.debug(String.join(" | ", token.getType().toString(), Integer.toString(token.getLine()), token.getSemantics().toString()));
Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/de/jplag/SubmissionSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ private void parseAllSubmissions() throws ExitException {
private void parseBaseCodeSubmission(Submission baseCode) throws BasecodeException {
long startTime = System.currentTimeMillis();
logger.trace("----- Parsing basecode submission: " + baseCode.getName());
if (!baseCode.parse(options.debugParser())) {
if (!baseCode.parse(options.debugParser(), options.normalize())) {
throw new BasecodeException("Could not successfully parse basecode submission!");
} else if (baseCode.getNumberOfTokens() < options.minimumTokenMatch()) {
throw new BasecodeException(String.format("Basecode submission contains %d token(s), which is less than the minimum match length (%d)!",
Expand Down Expand Up @@ -150,7 +150,7 @@ private void parseSubmissions(List<Submission> submissions) {
logger.trace("------ Parsing submission: " + submission.getName());
currentSubmissionName = submission.getName();

if (!(ok = submission.parse(options.debugParser()))) {
if (!(ok = submission.parse(options.debugParser(), options.normalize()))) {
errors++;
}

Expand Down
10 changes: 10 additions & 0 deletions core/src/main/java/de/jplag/exceptions/ConfigurationException.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package de.jplag.exceptions;

/**
* Exception that is thrown if the configuration of JPlag is invalid. As it extends {@link ExitException}, it can be
* handled together with the other exit exceptions.
*/
public class ConfigurationException extends ExitException {
/**
* Creates a new configuration exception.
* @param message is the description of the configuration problem; it is passed on unchanged to {@link ExitException}.
*/
public ConfigurationException(String message) {
super(message);
}
}
44 changes: 26 additions & 18 deletions core/src/main/java/de/jplag/options/JPlagOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ public record JPlagOptions(@JsonSerialize(using = LanguageSerializer.class) Lang
@JsonProperty("subdirectory_name") String subdirectoryName, @JsonProperty("file_suffixes") List<String> fileSuffixes,
@JsonProperty("exclusion_file_name") String exclusionFileName, @JsonProperty("similarity_metric") SimilarityMetric similarityMetric,
@JsonProperty("similarity_threshold") double similarityThreshold, @JsonProperty("max_comparisons") int maximumNumberOfComparisons,
@JsonProperty("cluster") ClusteringOptions clusteringOptions, boolean debugParser, @JsonProperty("merging") MergingOptions mergingOptions) {
@JsonProperty("cluster") ClusteringOptions clusteringOptions, boolean debugParser, @JsonProperty("merging") MergingOptions mergingOptions,
@JsonProperty("normalize") boolean normalize) {

public static final double DEFAULT_SIMILARITY_THRESHOLD = 0;
public static final int DEFAULT_SHOWN_COMPARISONS = 500;
Expand All @@ -68,13 +69,13 @@ public record JPlagOptions(@JsonSerialize(using = LanguageSerializer.class) Lang

public JPlagOptions(Language language, Set<File> submissionDirectories, Set<File> oldSubmissionDirectories) {
this(language, null, submissionDirectories, oldSubmissionDirectories, null, null, null, null, DEFAULT_SIMILARITY_METRIC,
DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false, new MergingOptions());
DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SHOWN_COMPARISONS, new ClusteringOptions(), false, new MergingOptions(), false);
}

public JPlagOptions(Language language, Integer minimumTokenMatch, Set<File> submissionDirectories, Set<File> oldSubmissionDirectories,
File baseCodeSubmissionDirectory, String subdirectoryName, List<String> fileSuffixes, String exclusionFileName,
SimilarityMetric similarityMetric, double similarityThreshold, int maximumNumberOfComparisons, ClusteringOptions clusteringOptions,
boolean debugParser, MergingOptions mergingOptions) {
boolean debugParser, MergingOptions mergingOptions, boolean normalize) {
this.language = language;
this.debugParser = debugParser;
this.fileSuffixes = fileSuffixes == null || fileSuffixes.isEmpty() ? null : Collections.unmodifiableList(fileSuffixes);
Expand All @@ -89,90 +90,97 @@ public JPlagOptions(Language language, Integer minimumTokenMatch, Set<File> subm
this.subdirectoryName = subdirectoryName;
this.clusteringOptions = clusteringOptions;
this.mergingOptions = mergingOptions;
this.normalize = normalize;
}

public JPlagOptions withLanguageOption(Language language) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withDebugParser(boolean debugParser) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withFileSuffixes(List<String> fileSuffixes) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withSimilarityThreshold(double similarityThreshold) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withMaximumNumberOfComparisons(int maximumNumberOfComparisons) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withSimilarityMetric(SimilarityMetric similarityMetric) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withMinimumTokenMatch(Integer minimumTokenMatch) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withExclusionFileName(String exclusionFileName) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withSubmissionDirectories(Set<File> submissionDirectories) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withOldSubmissionDirectories(Set<File> oldSubmissionDirectories) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withBaseCodeSubmissionDirectory(File baseCodeSubmissionDirectory) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withSubdirectoryName(String subdirectoryName) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withClusteringOptions(ClusteringOptions clusteringOptions) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

public JPlagOptions withMergingOptions(MergingOptions mergingOptions) {
return new JPlagOptions(language, minimumTokenMatch, submissionDirectories, oldSubmissionDirectories, baseCodeSubmissionDirectory,
subdirectoryName, fileSuffixes, exclusionFileName, similarityMetric, similarityThreshold, maximumNumberOfComparisons,
clusteringOptions, debugParser, mergingOptions);
clusteringOptions, debugParser, mergingOptions, normalize);
}

/**
 * Creates a copy of these options that differs only in the normalization flag.
 * @param normalize is the normalization flag of the copy.
 * @return a new options instance; all other values are taken over from this instance.
 */
public JPlagOptions withNormalize(boolean normalize) {
    return new JPlagOptions(
            this.language, this.minimumTokenMatch,
            this.submissionDirectories, this.oldSubmissionDirectories, this.baseCodeSubmissionDirectory,
            this.subdirectoryName, this.fileSuffixes, this.exclusionFileName,
            this.similarityMetric, this.similarityThreshold, this.maximumNumberOfComparisons,
            this.clusteringOptions, this.debugParser, this.mergingOptions,
            normalize);
}

public boolean hasBaseCode() {
Expand Down Expand Up @@ -264,7 +272,7 @@ public JPlagOptions(Language language, Integer minimumTokenMatch, File submissio
boolean debugParser, MergingOptions mergingOptions) throws BasecodeException {
this(language, minimumTokenMatch, Set.of(submissionDirectory), oldSubmissionDirectories,
convertLegacyBaseCodeToFile(baseCodeSubmissionName, submissionDirectory), subdirectoryName, fileSuffixes, exclusionFileName,
similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser, mergingOptions);
similarityMetric, similarityThreshold, maximumNumberOfComparisons, clusteringOptions, debugParser, mergingOptions, false);
}

/**
Expand Down
32 changes: 30 additions & 2 deletions language-api/src/main/java/de/jplag/Language.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,25 @@ public interface Language {
int minimumTokenMatch();

/**
* Parses a set of files.
* Parses a set of files. Override this method, if you don't require normalization.
* @param files are the files to parse.
* @return the list of parsed JPlag tokens.
* @throws ParsingException if an error during parsing the files occurred.
*/
List<Token> parse(Set<File> files) throws ParsingException;
default List<Token> parse(Set<File> files) throws ParsingException {
// Fallback for languages that override neither parse variant: the two-argument default delegates to this method,
// so a language without any parse implementation fails here at runtime instead of at compile time.
throw new UnsupportedOperationException("No parse method was implemented for language: " + this.getClass().getSimpleName());
}

/**
* Parses a set of files. Override this method, if you require normalization within the language module. The default
* implementation ignores {@code normalize} and delegates to {@link #parse(Set)}.
* @param files are the files to parse.
* @param normalize True, if the tokens should be normalized
* @return the list of parsed JPlag tokens.
* @throws ParsingException if an error during parsing the files occurred.
*/
default List<Token> parse(Set<File> files, boolean normalize) throws ParsingException {
return parse(files);
}

/**
* Indicates whether the tokens returned by parse have semantic information added to them, i.e. whether the token
Expand Down Expand Up @@ -93,4 +106,19 @@ default boolean expectsSubmissionOrder() {
default List<File> customizeSubmissionOrder(List<File> submissions) {
return submissions;
}

/**
* Indicates whether the tokens of this language can be normalized. The default is {@code false}, thus languages that
* support normalization need to override this method.
* @return True, if tokens for this language can be normalized
*/
default boolean supportsNormalization() {
return false;
}

/**
* Indicates whether the core module should normalize the tokens of this language. Override this method and return
* {@code false}, if you need normalization within the language module, but not in the core module.
* @return True, if the core normalization should be used (this is the default).
*/
default boolean requiresCoreNormalization() {
return true;
}
}
70 changes: 70 additions & 0 deletions language-api/src/test/java/de/jplag/LanguageTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package de.jplag;

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Tests the default methods of {@link Language}: the two parse variants and the normalization flags.
 */
class LanguageTest {
    private static final Set<File> NO_FILES = Collections.emptySet();

    /**
     * A language that overrides neither parse variant must fail on every entry point.
     */
    @Test
    public void testInvalidLanguageDoesNotWork() {
        Language invalidLanguage = new InvalidLanguage();
        Assertions.assertThrows(UnsupportedOperationException.class, () -> invalidLanguage.parse(NO_FILES, false));
        Assertions.assertThrows(UnsupportedOperationException.class, () -> invalidLanguage.parse(NO_FILES, true));
        Assertions.assertThrows(UnsupportedOperationException.class, () -> invalidLanguage.parse(NO_FILES));
    }

    /**
     * A language that overrides only the two-argument variant must be usable with and without normalization.
     */
    @Test
    public void testValidLanguageWithNormalization() throws ParsingException {
        Language language = new LanguageWithNormalization();
        Assertions.assertTrue(language.parse(NO_FILES, false).isEmpty());
        Assertions.assertTrue(language.parse(NO_FILES, true).isEmpty());
    }

    /**
     * A language that overrides only the one-argument variant must be reachable through both variants, as the
     * two-argument default delegates to the one-argument method.
     */
    @Test
    public void testValidLanguageWithoutNormalization() throws ParsingException {
        Language language = new LanguageWithoutNormalization();
        Assertions.assertTrue(language.parse(NO_FILES).isEmpty());
        Assertions.assertTrue(language.parse(NO_FILES, false).isEmpty());
        Assertions.assertTrue(language.parse(NO_FILES, true).isEmpty());
    }

    /**
     * Without overrides a language does not support normalization, but would use the core normalization.
     */
    @Test
    public void testNormalizationDefaults() {
        Language language = new LanguageWithoutNormalization();
        Assertions.assertFalse(language.supportsNormalization());
        Assertions.assertTrue(language.requiresCoreNormalization());
    }

    /**
     * Minimal language stub that implements only the abstract methods, so that the subclasses differ solely in the
     * parse variant they override.
     */
    private abstract static class LanguageBase implements Language {
        @Override
        public String[] suffixes() {
            return new String[0];
        }

        @Override
        public String getName() {
            return null;
        }

        @Override
        public String getIdentifier() {
            return null;
        }

        @Override
        public int minimumTokenMatch() {
            return 0;
        }
    }

    /** Overrides no parse variant at all. */
    private static class InvalidLanguage extends LanguageBase {
    }

    /** Overrides only the parse variant with the normalization flag. */
    private static class LanguageWithNormalization extends LanguageBase {
        @Override
        public List<Token> parse(Set<File> files, boolean normalize) {
            return Collections.emptyList();
        }
    }

    /** Overrides only the parse variant without the normalization flag. */
    private static class LanguageWithoutNormalization extends LanguageBase {
        @Override
        public List<Token> parse(Set<File> files) {
            return Collections.emptyList();
        }
    }
}
5 changes: 5 additions & 0 deletions languages/cpp2/src/main/java/de/jplag/cpp2/CPPLanguage.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,9 @@ public int minimumTokenMatch() {
public boolean tokensHaveSemantics() {
return true;
}

@Override
public boolean supportsNormalization() {
// Opts the C++ language into token normalization; the default declared in Language is false.
return true;
}
}
Loading
Loading