+ * sim.apply(null, null) = IllegalArgumentException + * sim.apply("foo", null) = IllegalArgumentException + * sim.apply(null, "foo") = IllegalArgumentException + * sim.apply("", "") = 1.0 + * sim.apply("foo", "foo") = 1.0 + * sim.apply("foo", "foo ") = 0.0 + * sim.apply("", "a") = 0.0 + * sim.apply("frog", "fog") = 0.0 + * sim.apply("fly", "ant") = 0.0 + * sim.apply("fly", "FLY") = 1.0 if ignoreCase is true, 0.0 otherwise + * sim.apply("fly", "fLy") = 1.0 if ignoreCase is true, 0.0 otherwise + *+ * + * @param left the first CharSequence, must not be null + * @param right the second CharSequence, must not be null + * @return result similarity + * @throws IllegalArgumentException if either CharSequence input is {@code null} + */ + @Override + public Double apply(final CharSequence left, final CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("CharSequences must not be null"); + } + return toScore(areEquals(left, right, ignoreCase)); + } + + /** Tells whether both sequences have the same length and hold the same chars at every index. When ignoreCase is set, a pair of chars is lower-cased before being compared only if exactly one of the two reports Character.isLowerCase; pairs that agree on isLowerCase are compared as they are. NOTE(review): confirm this narrower folding is intended. */ private static boolean areEquals(final CharSequence left, final CharSequence right, boolean ignoreCase) { + if (Objects.equals(left, right)) { /* shortcut: same reference or equal according to equals() */ + return true; + } + + if (left.length() != right.length()) { /* sequences of different length never match */ + return false; + } + + // Step-wise comparison + final int length = left.length(); + for (int i = 0; i < length; i++) { + char lc = left.charAt(i); + char rc = right.charAt(i); + if (ignoreCase && (Character.isLowerCase(lc) != Character.isLowerCase(rc))) { + lc = Character.toLowerCase(lc); + rc = Character.toLowerCase(rc); + } + if (lc != rc) { + return false; + } + } + return true; + } + + /** Converts the comparison outcome into MATCH_SCORE (true) or NO_MATCH_SCORE (false). */ private static double toScore(boolean match) { + return match ? 
MATCH_SCORE : NO_MATCH_SCORE; + } + +} diff --git a/src/main/java/it/kamaladafrica/codicefiscale/city/algo/JaroWinklerAlgoritm.java b/src/main/java/it/kamaladafrica/codicefiscale/city/algo/JaroWinklerAlgoritm.java new file mode 100644 index 0000000..33921cc --- /dev/null +++ b/src/main/java/it/kamaladafrica/codicefiscale/city/algo/JaroWinklerAlgoritm.java @@ -0,0 +1,148 @@ +package it.kamaladafrica.codicefiscale.city.algo; + +import java.util.Arrays; + +import it.kamaladafrica.codicefiscale.utils.StringUtils; + +/** + * A similarity algorithm indicating the percentage of matched characters + * between two character sequences. + * + *
+ * The Jaro measure is the weighted sum of the percentage of matched characters from + * each sequence and the transposed characters. Winkler increased this measure for + * matching initial characters. + *
+ * + *+ * This implementation is based on the Jaro Winkler similarity algorithm from + * + * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance. + *
+ * + *+ * This code has been adapted from Apache Commons Lang 3.3. + *
+ * + * @since 1.7 + */ +public class JaroWinklerAlgoritm implements ScoreAlgoritm+ * sim.apply(null, null) = IllegalArgumentException + * sim.apply("foo", null) = IllegalArgumentException + * sim.apply(null, "foo") = IllegalArgumentException + * sim.apply("", "") = 1.0 + * sim.apply("foo", "foo") = 1.0 + * sim.apply("foo", "foo ") = 0.94 + * sim.apply("foo", "foo ") = 0.91 + * sim.apply("foo", " foo ") = 0.87 + * sim.apply("foo", " foo") = 0.51 + * sim.apply("", "a") = 0.0 + * sim.apply("aaapppp", "") = 0.0 + * sim.apply("frog", "fog") = 0.93 + * sim.apply("fly", "ant") = 0.0 + * sim.apply("elephant", "hippo") = 0.44 + * sim.apply("hippo", "elephant") = 0.44 + * sim.apply("hippo", "zzzzzzzz") = 0.0 + * sim.apply("hello", "hallo") = 0.88 + * sim.apply("ABC Corporation", "ABC Corp") = 0.91 + * sim.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.95 + * sim.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.92 + * sim.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.88 + *+ * + * @param left the first CharSequence, must not be null + * @param right the second CharSequence, must not be null + * @return result similarity + * @throws IllegalArgumentException if either CharSequence input is {@code null} + */ + @Override + public Double apply(final CharSequence left, final CharSequence right) { + final double defaultScalingFactor = 0.1; /* weight of the mtp[2] boost applied in the return statement */ + + if (left == null || right == null) { + throw new IllegalArgumentException("CharSequences must not be null"); + } + + if (StringUtils.equals(left, right)) { /* inputs reported equal get the maximum score without further work */ + return 1d; + } + + final int[] mtp = matches(left, right); /* NOTE(review): matches() is not visible in this chunk; presumably the slots are match count, transposition count and common prefix length - confirm against matches() */ + final double m = mtp[0]; + if (m == 0) { /* return zero early, which also keeps the division by m below safe */ + return 0d; + } + final double j = (m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m) / 3; /* mean of three ratios: m over each length, and m minus half of mtp[1] over m */ + return j < 0.7d ? 
j : j + defaultScalingFactor * mtp[2] * (1d - j); /* scores below 0.7 are returned unchanged; otherwise a share of (1 - j) equal to defaultScalingFactor times mtp[2] is added */ + } + +} diff --git a/src/main/java/it/kamaladafrica/codicefiscale/city/algo/ScoreAlgoritm.java b/src/main/java/it/kamaladafrica/codicefiscale/city/algo/ScoreAlgoritm.java new file mode 100644 index 0000000..f038093 --- /dev/null +++ b/src/main/java/it/kamaladafrica/codicefiscale/city/algo/ScoreAlgoritm.java @@ -0,0 +1,47 @@ +package it.kamaladafrica.codicefiscale.city.algo; + +/** + * Interface for the concept of a string similarity score. + * + *
+ * A string similarity score is intended to have some of the properties + * of a metric, yet allowing for exceptions, namely the Jaro-Winkler similarity + * score. + *
+ *+ * We define a similarity score to be a function + * {@code d: [X * X] -> [0, INFINITY)} with the following properties: + *
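+ * <ul> + * <li>{@code d(s,t) >= 0}, non-negativity and</li> + * <li>{@code d(s,t) == d(t,s)}, symmetry.</li> + * </ul> + * + * Notice, these are two of the properties that contribute to d being a metric. + *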
+ * + * + *+ * Further, this is intended to be a {@code BiFunction<CharSequence, CharSequence, R>}. + * The {@code apply} method accepts a pair of {@link CharSequence} parameters + * and returns an {@code R} type similarity score. We have omitted the explicit + * statement of extending {@code BiFunction} because it is only available since Java + * 1.8, and we wish to maintain Java 1.7 compatibility. + *
+ * + * @param