-
Notifications
You must be signed in to change notification settings - Fork 72
Closed
Description
Hi,
Just trying this library out and the README gives this example:
string s1 = "My first string";
string s2 = "My other string...";
// Let's work with sequences of 2 characters...
var cosine = new Cosine(2);
// For cosine similarity I need the profile of strings
StringProfile profile1 = cosine.GetProfile(s1);
StringProfile profile2 = cosine.GetProfile(s2);
// Prints 0.516185
Console.WriteLine(profile1.CosineSimilarity(profile2));
However, this doesn't compile because ShingleBased.GetProfile is protected and because StringProfile no longer appears to be a thing (or maybe that's only in the Java code...?).
Anyway, I can subclass Cosine to hackily gain access to GetProfile, but then I can't really do anything meaningful with it because there is no StringProfile.CosineSimilarity available. I ended up having to copy/paste Cosine in its entirety and hack in GetProfile and CosineSimilarity methods to gain access to this functionality:
/// <summary>
/// Cosine similarity and distance between strings, computed over the
/// <c>IDictionary&lt;string, int&gt;</c> profiles returned by the base class's
/// <c>GetProfile</c>. The profile and the profile-based similarity are also
/// exposed publicly so callers can compute a profile once and reuse it.
/// </summary>
public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStringDistance
{
    /// <summary>Creates an instance, forwarding <paramref name="k"/> to the base constructor.</summary>
    /// <param name="k">Value passed to the base class (presumably the shingle length; confirm against ShingleBased).</param>
    public Cosine(int k) : base(k) { }

    /// <summary>Creates an instance using the base class's default configuration.</summary>
    public Cosine() { }

    /// <summary>
    /// Computes the cosine similarity of two strings.
    /// </summary>
    /// <param name="s1">First string.</param>
    /// <param name="s2">Second string.</param>
    /// <returns>
    /// 1 when the strings are equal, 0 when either string is shorter than <c>k</c>,
    /// otherwise the cosine similarity of their profiles.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public double Similarity(string s1, string s2)
    {
        if (s1 == null)
        {
            throw new ArgumentNullException(nameof(s1));
        }

        if (s2 == null)
        {
            throw new ArgumentNullException(nameof(s2));
        }

        if (s1.Equals(s2))
        {
            return 1;
        }

        if (s1.Length < k || s2.Length < k)
        {
            return 0;
        }

        return Similarity(GetProfile(s1), GetProfile(s2));
    }

    /// <summary>
    /// Computes the cosine distance of two strings, defined as
    /// <c>1.0 - Similarity(s1, s2)</c>.
    /// </summary>
    /// <param name="s1">First string.</param>
    /// <param name="s2">Second string.</param>
    /// <returns>One minus the similarity of the two strings.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public double Distance(string s1, string s2)
        => 1.0 - Similarity(s1, s2);

    /// <summary>
    /// Computes the cosine similarity of two precomputed profiles:
    /// their dot product divided by the product of their Euclidean norms.
    /// </summary>
    /// <param name="profile1">First profile.</param>
    /// <param name="profile2">Second profile.</param>
    /// <returns>
    /// The cosine similarity, or 0 when either profile has a zero norm
    /// (empty or all-zero counts), where the quotient would otherwise be NaN.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
    {
        if (profile1 == null)
        {
            throw new ArgumentNullException(nameof(profile1));
        }

        if (profile2 == null)
        {
            throw new ArgumentNullException(nameof(profile2));
        }

        double denominator = Norm(profile1) * Norm(profile2);

        // The norm is a square root of a sum of squares, so it is exactly zero
        // only for an empty or all-zero profile; an exact comparison is safe here.
        if (denominator == 0)
        {
            return 0;
        }

        return DotProduct(profile1, profile2) / denominator;
    }

    /// <summary>
    /// Alias of <see cref="Similarity(IDictionary{string, int}, IDictionary{string, int})"/>,
    /// kept so existing callers of this name continue to work.
    /// </summary>
    /// <param name="profile1">First profile.</param>
    /// <param name="profile2">Second profile.</param>
    /// <returns>The cosine similarity of the two profiles.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public double CosineSimilarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2) =>
        Similarity(profile1, profile2);

    /// <summary>
    /// Publicly exposes the base class's <c>GetProfile</c> by hiding it with a
    /// public member that forwards to the base implementation.
    /// </summary>
    /// <param name="s">String to profile.</param>
    /// <returns>The profile produced by the base class for <paramref name="s"/>.</returns>
    public new IDictionary<string, int> GetProfile(string s) =>
        base.GetProfile(s);

    /// <summary>Euclidean norm of a profile: the square root of the sum of squared counts.</summary>
    private static double Norm(IDictionary<string, int> profile)
    {
        double sumOfSquares = 0;
        foreach (var entry in profile)
        {
            // Promote to double before multiplying so large counts cannot overflow int.
            sumOfSquares += (double)entry.Value * entry.Value;
        }

        return Math.Sqrt(sumOfSquares);
    }

    /// <summary>Dot product of two profiles over the keys they have in common.</summary>
    private static double DotProduct(IDictionary<string, int> profile1,
        IDictionary<string, int> profile2)
    {
        // Iterate the smaller map and probe the larger one to minimise lookups.
        var smallProfile = profile2;
        var largeProfile = profile1;
        if (profile1.Count < profile2.Count)
        {
            smallProfile = profile1;
            largeProfile = profile2;
        }

        double sum = 0;
        foreach (var entry in smallProfile)
        {
            if (largeProfile.TryGetValue(entry.Key, out var otherCount))
            {
                // Promote to double before multiplying so large counts cannot overflow int.
                sum += (double)entry.Value * otherCount;
            }
        }

        return sum;
    }
}
Metadata
Assignees
Labels
No labels