-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: remove not essential dependencies (#33)
* chore: add owasp dependency-check plugin * fix: add replacement classes for guava and commons-lang * fix: remove guava and commons-lang dependencies * fix: remove commons-lang3 dependency * fix: remove commons-text dependency * fix: fix URIs should not be hardcoded (java:S1075) * refactor: test and algos * fix: remove useless validation after codice generation * test: cover missing instructions * chore: gitignore * fix: remove commons-csv dependency * refactor: fix technical debt --------- Co-authored-by: Alessio D'Innocenti <a.dinnocenti@jdk.it>
- Loading branch information
1 parent
5419f86
commit d4361fa
Showing
54 changed files
with
1,936 additions
and
327 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
10 changes: 10 additions & 0 deletions
10
src/main/java/it/kamaladafrica/codicefiscale/city/CityStreamSupplier.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package it.kamaladafrica.codicefiscale.city; | ||
|
||
import java.util.function.Supplier; | ||
import java.util.stream.Stream; | ||
|
||
import it.kamaladafrica.codicefiscale.City; | ||
|
||
public interface CityStreamSupplier extends Supplier<Stream<City>> { | ||
|
||
} |
85 changes: 85 additions & 0 deletions
85
src/main/java/it/kamaladafrica/codicefiscale/city/algo/ExactMatchAlgoritm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
package it.kamaladafrica.codicefiscale.city.algo; | ||
|
||
import java.util.Objects; | ||
|
||
/** | ||
* A similarity algorithm that returns 1.0 only if left and right matches, 0.0 | ||
* otherwise. | ||
* | ||
* By default computation ignore the case,so "AAA" and "aaa" are equals | ||
*/ | ||
public class ExactMatchAlgoritm implements ScoreAlgoritm<Double> { | ||
|
||
public static final double MATCH_SCORE = 1.0; | ||
public static final double NO_MATCH_SCORE = 0.0; | ||
|
||
private final boolean ignoreCase; | ||
|
||
public ExactMatchAlgoritm(boolean ignoreCase) { | ||
this.ignoreCase = ignoreCase; | ||
} | ||
|
||
public ExactMatchAlgoritm() { | ||
this(true); | ||
} | ||
|
||
/** | ||
* Computes the Exact Match Similarity between two character sequences. | ||
* | ||
* <pre> | ||
* sim.apply(null, null) = IllegalArgumentException | ||
* sim.apply("foo", null) = IllegalArgumentException | ||
* sim.apply(null, "foo") = IllegalArgumentException | ||
* sim.apply("", "") = 1.0 | ||
* sim.apply("foo", "foo") = 1.0 | ||
* sim.apply("foo", "foo ") = 0.0 | ||
* sim.apply("", "a") = 0.0 | ||
* sim.apply("frog", "fog") = 0.0 | ||
* sim.apply("fly", "ant") = 0.0 | ||
* sim.apply("fly", "FLY") = 1.0 if ignoreCase is true, 0.0 otherwise | ||
* sim.apply("fly", "fLy") = 1.0 if ignoreCase is true, 0.0 otherwise | ||
* </pre> | ||
* | ||
* @param left the first CharSequence, must not be null | ||
* @param right the second CharSequence, must not be null | ||
* @return result similarity | ||
* @throws IllegalArgumentException if either CharSequence input is {@code null} | ||
*/ | ||
@Override | ||
public Double apply(final CharSequence left, final CharSequence right) { | ||
if (left == null || right == null) { | ||
throw new IllegalArgumentException("CharSequences must not be null"); | ||
} | ||
return toScore(areEquals(left, right, ignoreCase)); | ||
} | ||
|
||
private static boolean areEquals(final CharSequence left, final CharSequence right, boolean ignoreCase) { | ||
if (Objects.equals(left, right)) { | ||
return true; | ||
} | ||
|
||
if (left.length() != right.length()) { | ||
return false; | ||
} | ||
|
||
// Step-wise comparison | ||
final int length = left.length(); | ||
for (int i = 0; i < length; i++) { | ||
char lc = left.charAt(i); | ||
char rc = right.charAt(i); | ||
if (ignoreCase && (Character.isLowerCase(lc) != Character.isLowerCase(rc))) { | ||
lc = Character.toLowerCase(lc); | ||
rc = Character.toLowerCase(rc); | ||
} | ||
if (lc != rc) { | ||
return false; | ||
} | ||
} | ||
return true; | ||
} | ||
|
||
private static double toScore(boolean match) { | ||
return match ? MATCH_SCORE : NO_MATCH_SCORE; | ||
} | ||
|
||
} |
148 changes: 148 additions & 0 deletions
148
src/main/java/it/kamaladafrica/codicefiscale/city/algo/JaroWinklerAlgoritm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
package it.kamaladafrica.codicefiscale.city.algo; | ||
|
||
import java.util.Arrays; | ||
|
||
import it.kamaladafrica.codicefiscale.utils.StringUtils; | ||
|
||
/** | ||
* A similarity algorithm indicating the percentage of matched characters | ||
* between two character sequences. | ||
* | ||
* <p> | ||
* The Jaro measure is the weighted sum of percentage of matched characters from | ||
* each file and transposed characters. Winkler increased this measure for | ||
* matching initial characters. | ||
* </p> | ||
* | ||
* <p> | ||
* This implementation is based on the Jaro Winkler similarity algorithm from | ||
* <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"> | ||
* http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>. | ||
* </p> | ||
* | ||
* <p> | ||
* This code has been adapted from Apache Commons Lang 3.3. | ||
* </p> | ||
* | ||
* @since 1.7 | ||
*/ | ||
public class JaroWinklerAlgoritm implements ScoreAlgoritm<Double> { | ||
|
||
/** | ||
* This method returns the Jaro-Winkler string matches, half transpositions, | ||
* prefix array. | ||
* | ||
* @param first the first string to be matched | ||
* @param second the second string to be matched | ||
* @return mtp array containing: matches, half transpositions, and prefix | ||
*/ | ||
protected static int[] matches(final CharSequence first, final CharSequence second) { | ||
final CharSequence max; | ||
final CharSequence min; | ||
if (first.length() > second.length()) { | ||
max = first; | ||
min = second; | ||
} else { | ||
max = second; | ||
min = first; | ||
} | ||
final int range = Math.max(max.length() / 2 - 1, 0); | ||
final int[] matchIndexes = new int[min.length()]; | ||
Arrays.fill(matchIndexes, -1); | ||
final boolean[] matchFlags = new boolean[max.length()]; | ||
int matches = 0; | ||
for (int mi = 0; mi < min.length(); mi++) { | ||
final char c1 = min.charAt(mi); | ||
for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { | ||
if (!matchFlags[xi] && c1 == max.charAt(xi)) { | ||
matchIndexes[mi] = xi; | ||
matchFlags[xi] = true; | ||
matches++; | ||
break; | ||
} | ||
} | ||
} | ||
final char[] ms1 = new char[matches]; | ||
final char[] ms2 = new char[matches]; | ||
for (int i = 0, si = 0; i < min.length(); i++) { | ||
if (matchIndexes[i] != -1) { | ||
ms1[si] = min.charAt(i); | ||
si++; | ||
} | ||
} | ||
for (int i = 0, si = 0; i < max.length(); i++) { | ||
if (matchFlags[i]) { | ||
ms2[si] = max.charAt(i); | ||
si++; | ||
} | ||
} | ||
int halfTranspositions = 0; | ||
for (int mi = 0; mi < ms1.length; mi++) { | ||
if (ms1[mi] != ms2[mi]) { | ||
halfTranspositions++; | ||
} | ||
} | ||
int prefix = 0; | ||
for (int mi = 0; mi < Math.min(4, min.length()); mi++) { | ||
if (first.charAt(mi) != second.charAt(mi)) { | ||
break; | ||
} | ||
prefix++; | ||
} | ||
return new int[] { matches, halfTranspositions, prefix }; | ||
} | ||
|
||
/** | ||
* Computes the Jaro Winkler Similarity between two character sequences. | ||
* | ||
* <pre> | ||
* sim.apply(null, null) = IllegalArgumentException | ||
* sim.apply("foo", null) = IllegalArgumentException | ||
* sim.apply(null, "foo") = IllegalArgumentException | ||
* sim.apply("", "") = 1.0 | ||
* sim.apply("foo", "foo") = 1.0 | ||
* sim.apply("foo", "foo ") = 0.94 | ||
* sim.apply("foo", "foo ") = 0.91 | ||
* sim.apply("foo", " foo ") = 0.87 | ||
* sim.apply("foo", " foo") = 0.51 | ||
* sim.apply("", "a") = 0.0 | ||
* sim.apply("aaapppp", "") = 0.0 | ||
* sim.apply("frog", "fog") = 0.93 | ||
* sim.apply("fly", "ant") = 0.0 | ||
* sim.apply("elephant", "hippo") = 0.44 | ||
* sim.apply("hippo", "elephant") = 0.44 | ||
* sim.apply("hippo", "zzzzzzzz") = 0.0 | ||
* sim.apply("hello", "hallo") = 0.88 | ||
* sim.apply("ABC Corporation", "ABC Corp") = 0.91 | ||
* sim.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95 | ||
* sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92 | ||
* sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88 | ||
* </pre> | ||
* | ||
* @param left the first CharSequence, must not be null | ||
* @param right the second CharSequence, must not be null | ||
* @return result similarity | ||
* @throws IllegalArgumentException if either CharSequence input is {@code null} | ||
*/ | ||
@Override | ||
public Double apply(final CharSequence left, final CharSequence right) { | ||
final double defaultScalingFactor = 0.1; | ||
|
||
if (left == null || right == null) { | ||
throw new IllegalArgumentException("CharSequences must not be null"); | ||
} | ||
|
||
if (StringUtils.equals(left, right)) { | ||
return 1d; | ||
} | ||
|
||
final int[] mtp = matches(left, right); | ||
final double m = mtp[0]; | ||
if (m == 0) { | ||
return 0d; | ||
} | ||
final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3; | ||
return j < 0.7d ? j : j + defaultScalingFactor * mtp[2] * (1d - j); | ||
} | ||
|
||
} |
Oops, something went wrong.