-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
JaroSimilarity.cs
96 lines (82 loc) · 3.01 KB
/
JaroSimilarity.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
using System;
namespace Algorithms.Strings.Similarity;
/// <summary>
/// <para>
/// Jaro Similarity measures how similar two strings are.
/// Result is between 0 and 1 where 0 represnts that there is no similarity between strings and 1 represents equal strings.
/// Time complexity is O(a*b) where a is the length of the first string and b is the length of the second string.
/// </para>
/// <para>
/// Wikipedia: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity.
/// </para>
/// </summary>
public static class JaroSimilarity
{
/// <summary>
/// Calculates Jaro Similarity between two strings.
/// </summary>
/// <param name="s1">First string.</param>
/// <param name="s2">Second string.</param>
public static double Calculate(string s1, string s2)
{
if (s1 == s2)
{
return 1;
}
var longerString = s1.Length > s2.Length ? s1 : s2;
var shorterString = s1.Length < s2.Length ? s1 : s2;
// will look for matching characters in this range
var matchingCharacterRange = Math.Max((longerString.Length / 2) - 1, 0);
var matches = 0d;
// true if i-th index of s1 was matched in s2
var s1MatchedIndeces = new bool[s1.Length];
// true if i-th index of s2 was matched in s1
var s2MatchedIndeces = new bool[s2.Length];
for (var i = 0; i < longerString.Length; i++)
{
var startIndex = Math.Max(i - matchingCharacterRange, 0);
var endIndex = Math.Min(i + matchingCharacterRange, shorterString.Length - 1);
for (var j = startIndex; j <= endIndex; j++)
{
if (s1[i] == s2[j] && !s2MatchedIndeces[j])
{
matches++;
s1MatchedIndeces[i] = true;
s2MatchedIndeces[j] = true;
break;
}
}
}
if (matches == 0)
{
return 0;
}
var transpositions = CalculateTranspositions(s1, s2, s1MatchedIndeces, s2MatchedIndeces);
return ((matches / s1.Length) + (matches / s2.Length) + ((matches - transpositions) / matches)) / 3;
}
/// <summary>
/// Calculates number of matched characters that are not in the right order.
/// </summary>
private static int CalculateTranspositions(string s1, string s2, bool[] s1MatchedIndeces, bool[] s2MatchedIndeces)
{
var transpositions = 0;
var s2Index = 0;
for (var s1Index = 0; s1Index < s1.Length; s1Index++)
{
if (s1MatchedIndeces[s1Index])
{
while (!s2MatchedIndeces[s2Index])
{
s2Index++;
}
if (s1[s1Index] != s2[s2Index])
{
transpositions++;
}
s2Index++;
}
}
transpositions /= 2;
return transpositions;
}
}