Skip to content

Commit

Permalink
General cleanup (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
AHeise authored Feb 28, 2019
1 parent 5af276b commit 8a9a8ae
Show file tree
Hide file tree
Showing 71 changed files with 917 additions and 720 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2018 bakdata
Copyright (c) 2019 bakdata

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
21 changes: 10 additions & 11 deletions build.gradle.kts
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
plugins {
`java-library`
// release
id("net.researchgate.release") version "2.6.0"
id("com.bakdata.sonar") version "1.0.1"
id("com.bakdata.sonatype") version "1.0.1"
id("com.bakdata.sonar") version "1.1.2"
id("com.bakdata.sonatype") version "1.1.2"
id("org.hildan.github.changelog") version "0.8.0"
}

Expand Down Expand Up @@ -46,13 +45,13 @@ subprojects {
}

dependencies {
testImplementation("org.junit.jupiter:junit-jupiter-api:5.3.0")
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.3.0")
testImplementation(group= "org.assertj", name= "assertj-core", version= "3.11.1")

compileOnly("org.projectlombok:lombok:1.18.4")
annotationProcessor("org.projectlombok:lombok:1.18.4")
testCompileOnly("org.projectlombok:lombok:1.18.4")
testAnnotationProcessor("org.projectlombok:lombok:1.18.4")
"testImplementation"("org.junit.jupiter:junit-jupiter-api:5.3.0")
"testRuntimeOnly"("org.junit.jupiter:junit-jupiter-engine:5.3.0")
"testImplementation"(group = "org.assertj", name = "assertj-core", version = "3.11.1")

"compileOnly"("org.projectlombok:lombok:1.18.6")
"annotationProcessor"("org.projectlombok:lombok:1.18.6")
"testCompileOnly"("org.projectlombok:lombok:1.18.6")
"testAnnotationProcessor"("org.projectlombok:lombok:1.18.6")
}
}
2 changes: 1 addition & 1 deletion common/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ dependencies {
"api"(group = "commons-codec", name = "commons-codec", version = "1.11")

implementation(group = "com.google.guava", name = "guava", version = "26.0-jre")
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License
* MIT License
*
* Copyright (c) 2018 bakdata GmbH
* Copyright (c) 2019 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -20,7 +20,6 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.candidate_selection;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License
* MIT License
*
* Copyright (c) 2018 bakdata GmbH
* Copyright (c) 2019 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -20,7 +20,6 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.candidate_selection;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License
* MIT License
*
* Copyright (c) 2018 bakdata GmbH
* Copyright (c) 2019 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -20,7 +20,6 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.candidate_selection.online;

Expand All @@ -46,7 +45,8 @@ public class OnlineSortedNeighborhoodMethod<T> implements OnlineCandidateSelecti
int defaultWindowSize = 10;

public List<Candidate<T>> getCandidates(final T newRecord) {
return this.passes.stream().flatMap(pass -> pass.getCandidates(newRecord).stream()).distinct().collect(Collectors.toList());
return this.passes.stream().flatMap(pass -> pass.getCandidates(newRecord).stream()).distinct()
.collect(Collectors.toList());
}

@Value
Expand All @@ -58,14 +58,14 @@ public static class Pass<T> {

List<Candidate<T>> getCandidates(final T newRecord) {
final Comparable<?> newKey = this.sortingKey.getKeyExtractor().apply(newRecord);
if(newKey == null) {
if (newKey == null) {
return List.of();
}
final Stream<T> largerRecords = this.index.tailMap(newKey).values().stream().flatMap(List::stream).limit(
this.windowSize / 2);
this.windowSize / 2);
final Stream<T> smallerRecords =
this.index.descendingMap().tailMap(newKey).values().stream().flatMap(List::stream).limit(
this.windowSize / 2);
this.index.descendingMap().tailMap(newKey).values().stream().flatMap(List::stream).limit(
this.windowSize / 2);
final List<Candidate<T>> candidates = Stream.concat(smallerRecords, largerRecords)
.map(oldRecord -> new Candidate<>(newRecord, oldRecord))
.collect(Collectors.toList());
Expand All @@ -77,7 +77,8 @@ List<Candidate<T>> getCandidates(final T newRecord) {
@SuppressWarnings({"WeakerAccess", "unused"})
public static class OnlineSortedNeighborhoodMethodBuilder<T> {

public OnlineSortedNeighborhoodMethodBuilder<T> sortingKey(final SortingKey<T> sortingKey, final int windowSize) {
public OnlineSortedNeighborhoodMethodBuilder<T> sortingKey(final SortingKey<T> sortingKey,
final int windowSize) {
return this.pass(new Pass<>(sortingKey, windowSize));
}

Expand All @@ -90,7 +91,7 @@ public OnlineSortedNeighborhoodMethodBuilder<T> sortingKeys(final Collection<Sor
}

public OnlineSortedNeighborhoodMethodBuilder<T> sortingKeys(final Iterable<SortingKey<T>> sortingKeys,
final int windowSize) {
final int windowSize) {
for (final SortingKey<T> sortingKey : sortingKeys) {
this.sortingKey(sortingKey, windowSize);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
/*
* MIT License
*
* Copyright (c) 2019 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package com.bakdata.deduplication.classifier;

import com.bakdata.deduplication.candidate_selection.Candidate;
Expand All @@ -11,9 +35,9 @@
@Value
public class OracleClassifier<T> implements Classifier<T> {
private static final Classification DUPLICATE =
Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build();
Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build();
private static final Classification NON_DUPLICATE =
Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build();
Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build();

@NonNull
Set<Candidate<T>> goldDuplicates;
Expand All @@ -22,13 +46,13 @@ public class OracleClassifier<T> implements Classifier<T> {

private Set<Candidate<T>> calculateSymmetricDuplicates() {
return this.getGoldDuplicates().stream()
.flatMap(duplicate ->
Stream.of(duplicate, new Candidate<>(duplicate.getOldRecord(), duplicate.getNewRecord())))
.collect(Collectors.toSet());
.flatMap(duplicate ->
Stream.of(duplicate, new Candidate<>(duplicate.getOldRecord(), duplicate.getNewRecord())))
.collect(Collectors.toSet());
}

@Override
public Classification classify(final Candidate<T> candidate) {
return this.getSymmetricDuplicates().contains(candidate) ? DUPLICATE : NON_DUPLICATE;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License
* MIT License
*
* Copyright (c) 2018 bakdata GmbH
* Copyright (c) 2019 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -20,7 +20,6 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.classifier;

Expand All @@ -36,16 +35,21 @@
import lombok.Value;

/**
* Successively applies a list of rules to the record and returns the respective {@link Classification} with the following cases:
* Successively applies a list of rules to the record and returns the respective {@link Classification} with the
* following cases:
* <ul>
* <li>If any rule classifies the pair unambiguously as {@link Classification.ClassificationResult#DUPLICATE} or {@link Classification.ClassificationResult#NON_DUPLICATE}, the classification is immediately returned.</li>
* <li>If some rule classifies the pair as {@link Classification.ClassificationResult#POSSIBLE_DUPLICATE}, the remaining rules will be evaluated to see if an unambiguous classification will be reached, in which case that classification is returned. If the results are only ambiguous, the last {@code POSSIBLE_DUPLICATE} classification will be returned.</li>
* <li>If any rule classifies the pair unambiguously as {@link Classification.ClassificationResult#DUPLICATE} or {@link
* Classification.ClassificationResult#NON_DUPLICATE}, the classification is immediately returned.</li>
* <li>If some rule classifies the pair as {@link Classification.ClassificationResult#POSSIBLE_DUPLICATE}, the
* remaining
* rules will be evaluated to see if an unambiguous classification will be reached, in which case that classification is
* returned. If the results are only ambiguous, the last {@code POSSIBLE_DUPLICATE} classification will be
* returned.</li>
* <li>If no rule can be applied, the result is {@link #UNKNOWN}.</li>
* </ul>
* <br>
* The {@code Classification} will contain a description naming the triggered rule and converts the rule score into a confidence score.
*
* @param <T>
* The {@code Classification} will contain a description naming the triggered rule and converts the rule score into a
* confidence score.
*/
@Value
@Builder
Expand Down Expand Up @@ -84,50 +88,56 @@ private SimilarityException createException(final Candidate<T> candidate, final
}

private Optional<Classification> evaluateRule(final Rule<? super T> rule, final Candidate<? extends T> candidate,
final SimilarityContext context) {
return context.safeExecute(() -> rule.evaluate(candidate.getNewRecord(), candidate.getOldRecord(), context)).map(score -> {
if (Float.isNaN(score)) {
return UNKNOWN;
}
if (score <= -0.0f) {
return Classification.builder()
.result(Classification.ClassificationResult.NON_DUPLICATE)
.confidence(-score)
.explanation(rule.getName())
.build();
} else {
return Classification.builder()
.result(Classification.ClassificationResult.DUPLICATE)
.confidence(score)
.explanation(rule.getName())
.build();
}
});
final SimilarityContext context) {
return context.safeExecute(() -> rule.evaluate(candidate.getNewRecord(), candidate.getOldRecord(), context))
.map(score -> {
if (Float.isNaN(score)) {
return UNKNOWN;
}
if (score <= -0.0f) {
return Classification.builder()
.result(Classification.ClassificationResult.NON_DUPLICATE)
.confidence(-score)
.explanation(rule.getName())
.build();
} else {
return Classification.builder()
.result(Classification.ClassificationResult.DUPLICATE)
.confidence(score)
.explanation(rule.getName())
.build();
}
});
}

@SuppressWarnings({"WeakerAccess", "UnusedReturnValue"})
public static class RuleBasedClassifierBuilder<T> {

public RuleBasedClassifierBuilder<T> positiveRule(final String name, final BiPredicate<T, T> applicablePredicate,
final SimilarityMeasure<T> similarityMeasure) {
public RuleBasedClassifierBuilder<T> positiveRule(final String name,
final BiPredicate<T, T> applicablePredicate,
final SimilarityMeasure<T> similarityMeasure) {
return this.positiveRule(name, (left, right, context) ->
applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) : DOES_NOT_APPLY);
applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context)
: DOES_NOT_APPLY);
}

public RuleBasedClassifierBuilder<T> positiveRule(final String name, final SimilarityMeasure<T> similarityMeasure) {
public RuleBasedClassifierBuilder<T> positiveRule(final String name,
final SimilarityMeasure<T> similarityMeasure) {
return this.rule(new Rule<>(name, similarityMeasure.unknownIf(s -> s <= 0)));
}

public RuleBasedClassifierBuilder<T> negativeRule(final String name, final BiPredicate<T, T> applicablePredicate,
final SimilarityMeasure<T> similarityMeasure) {
public RuleBasedClassifierBuilder<T> negativeRule(final String name,
final BiPredicate<T, T> applicablePredicate,
final SimilarityMeasure<T> similarityMeasure) {
return this.negativeRule(name, (left, right, context) ->
applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) : DOES_NOT_APPLY);
applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context)
: DOES_NOT_APPLY);
}

public RuleBasedClassifierBuilder<T> negativeRule(final String name,
final SimilarityMeasure<? super T> similarityMeasure) {
final SimilarityMeasure<? super T> similarityMeasure) {
final SimilarityMeasure<T> negativeSim =
(left, right, context) -> -similarityMeasure.getSimilarity(left, right, context);
(left, right, context) -> -similarityMeasure.getSimilarity(left, right, context);
return this.rule(new Rule<>(name, negativeSim.unknownIf(s -> s >= 0)));
}

Expand All @@ -150,4 +160,4 @@ float evaluate(final T left, final T right, final SimilarityContext context) {
return this.measure.getSimilarity(left, right, context);
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* The MIT License
* MIT License
*
* Copyright (c) 2018 bakdata GmbH
* Copyright (c) 2019 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand All @@ -20,7 +20,6 @@
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.clustering;

Expand All @@ -38,21 +37,23 @@
import lombok.Value;

/**
* Wraps another clustering and keeps clusters together when the wrapped clustering would split them.<br>
* Example: consider a stable marriage-based clustering where A1-B have been previously matched and subsequently clustered.
* If a strong A2-B would replace that pair and thus split the cluster, this consistent clustering returns a cluster [A1, A2, B] instead.<br>
* Wraps another clustering and keeps clusters together when the wrapped clustering would split them.<br> Example:
* consider a stable marriage-based clustering where A1-B have been previously matched and subsequently clustered. If a
* strong A2-B would replace that pair and thus split the cluster, this consistent clustering returns a cluster [A1, A2,
* B] instead.<br>
* <p>
* This clustering is similar to {@link TransitiveClosure} but allows the wrapped clustering to split temporary (=not-returned) clusters. Thus, in the example above, we have the following two situations:
* - If A1-B and A2-B would be passed in the same invocation of {@link #cluster(List)}, only cluster [A2, B] would be returned.
* - If A1-B is passed in a first invocation, this invocation returns [A1, B]. The following invocation with A2-B would then return [A1, A2, B].
* This clustering is similar to {@link TransitiveClosure} but allows the wrapped clustering to split temporary
* (=not-returned) clusters. Thus, in the example above, we have the following two situations: - If A1-B and A2-B would
* be passed in the same invocation of {@link #cluster(List)}, only cluster [A2, B] would be returned. - If A1-B is
* passed in a first invocation, this invocation returns [A1, B]. The following invocation with A2-B would then return
* [A1, A2, B].
* </p>
* It thus trades off clustering accuracy to increase reliability of subsequent data processing.
*
* @param <T>
*/
@Value
@Builder
public class ConsistentClustering<C extends Comparable<C>, T, I extends Comparable<? super I>> implements Clustering<C, T> {
public class ConsistentClustering<C extends Comparable<C>, T, I extends Comparable<? super I>>
implements Clustering<C, T> {
@NonNull
Clustering<C, T> clustering;
Function<T, I> idExtractor;
Expand Down
Loading

0 comments on commit 8a9a8ae

Please sign in to comment.