Skip to content

ShingleBased.GetProfile is not public #21

@kentcb

Description

@kentcb

Hi,

Just trying this library out and the README gives this example:

string s1 = "My first string";
string s2 = "My other string...";
        
// Let's work with sequences of 2 characters...
var cosine = new Cosine(2);
        
// For cosine similarity I need the profile of strings
StringProfile profile1 = cosine.GetProfile(s1);
StringProfile profile2 = cosine.GetProfile(s2);
        
// Prints 0.516185
Console.WriteLine(profile1.CosineSimilarity(profile2));

However, this doesn't compile because ShingleBased.GetProfile is protected and because StringProfile no longer appears to be a thing (or maybe that's only in the Java code...?).

Anyway, I can subclass Cosine to hackily gain access to GetProfile, but then I can't really do anything meaningful with it because there is no StringProfile.CosineSimilarity available. I ended up having to copy/paste Cosine in its entirety and hack in GetProfile and CosineSimilarity methods to gain access to this functionality:

/// <summary>
/// Cosine similarity and distance between strings, computed from the profiles
/// returned by <c>ShingleBased.GetProfile</c>. This copy additionally exposes the
/// profile and the profile-level similarity publicly so callers can compute a
/// profile once and reuse it across comparisons.
/// </summary>
public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStringDistance
{
	/// <summary>Creates an instance, forwarding <paramref name="k"/> to the base class.</summary>
	/// <param name="k">Value passed to the <c>ShingleBased</c> constructor (presumably the shingle length; confirm against the base class).</param>
	public Cosine(int k) : base(k) { }

	/// <summary>Creates an instance using the base class defaults.</summary>
	public Cosine() { }

	/// <summary>
	/// Computes the cosine similarity of two strings.
	/// </summary>
	/// <param name="s1">First string.</param>
	/// <param name="s2">Second string.</param>
	/// <returns>
	/// 1 when the strings are equal, 0 when either string is shorter than <c>k</c>,
	/// otherwise the cosine similarity of their profiles.
	/// </returns>
	/// <exception cref="ArgumentNullException">Either argument is null.</exception>
	public double Similarity(string s1, string s2)
	{
		if (s1 == null)
		{
			throw new ArgumentNullException(nameof(s1));
		}

		if (s2 == null)
		{
			throw new ArgumentNullException(nameof(s2));
		}

		if (s1.Equals(s2))
		{
			return 1;
		}

		if (s1.Length < k || s2.Length < k)
		{
			return 0;
		}

		return Similarity(GetProfile(s1), GetProfile(s2));
	}

	/// <summary>
	/// Computes the cosine similarity of two precomputed profiles. Equivalent to
	/// <see cref="Similarity(IDictionary{string, int}, IDictionary{string, int})"/>;
	/// kept as a separate name for callers that already use it.
	/// </summary>
	/// <param name="profile1">First profile.</param>
	/// <param name="profile2">Second profile.</param>
	/// <returns>The cosine similarity of the two profiles.</returns>
	/// <exception cref="ArgumentNullException">Either argument is null.</exception>
	public double CosineSimilarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2) =>
		Similarity(profile1, profile2);

	/// <summary>Computes the Euclidean norm of the profile's values.</summary>
	private static double Norm(IDictionary<string, int> profile)
	{
		double agg = 0;

		foreach (var entry in profile)
		{
			// Multiply by 1.0 first so the square is computed in double and cannot overflow int.
			agg += 1.0 * entry.Value * entry.Value;
		}

		return Math.Sqrt(agg);
	}

	/// <summary>Computes the dot product of two profiles over their shared keys.</summary>
	private static double DotProduct(IDictionary<string, int> profile1,
		IDictionary<string, int> profile2)
	{
		// Loop over the smallest map; only keys present in both contribute.
		var small_profile = profile2;
		var large_profile = profile1;

		if (profile1.Count < profile2.Count)
		{
			small_profile = profile1;
			large_profile = profile2;
		}

		double agg = 0;
		foreach (var entry in small_profile)
		{
			if (!large_profile.TryGetValue(entry.Key, out var i)) continue;

			agg += 1.0 * entry.Value * i;
		}

		return agg;
	}

	/// <summary>Computes the cosine distance, defined as <c>1 - Similarity(s1, s2)</c>.</summary>
	/// <param name="s1">First string.</param>
	/// <param name="s2">Second string.</param>
	/// <returns>One minus the similarity of the two strings.</returns>
	/// <exception cref="ArgumentNullException">Either argument is null.</exception>
	public double Distance(string s1, string s2)
		=> 1.0 - Similarity(s1, s2);

	/// <summary>
	/// Computes the cosine similarity of two precomputed profiles:
	/// the dot product divided by the product of the norms.
	/// </summary>
	/// <param name="profile1">First profile.</param>
	/// <param name="profile2">Second profile.</param>
	/// <returns>
	/// The cosine similarity, or 0 when either profile has a zero norm
	/// (empty, or all values zero), where the quotient would otherwise be NaN.
	/// </returns>
	/// <exception cref="ArgumentNullException">Either argument is null.</exception>
	public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
	{
		if (profile1 == null)
		{
			throw new ArgumentNullException(nameof(profile1));
		}

		if (profile2 == null)
		{
			throw new ArgumentNullException(nameof(profile2));
		}

		var denominator = Norm(profile1) * Norm(profile2);

		// A norm is the square root of a sum of integer squares, so it is exactly 0
		// only when the profile carries no weight; guard the 0/0 case explicitly.
		if (denominator == 0)
		{
			return 0;
		}

		return DotProduct(profile1, profile2) / denominator;
	}

	/// <summary>
	/// Publicly exposes the base class's <c>GetProfile</c>, which is not public there.
	/// </summary>
	/// <param name="s">String to profile.</param>
	/// <returns>Whatever <c>ShingleBased.GetProfile</c> returns for <paramref name="s"/>.</returns>
	public new IDictionary<string, int> GetProfile(string s) =>
		base.GetProfile(s);
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions