diff --git a/README.md b/README.md
index 583a9fe..4cb0948 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,7 @@ print(results)
 - Cosine
 - Dice
 - Jaccard
+- Overlap
 
 ## Run Tests
 ```
diff --git a/simstring/measure/overlap.py b/simstring/measure/overlap.py
new file mode 100644
index 0000000..d5e4fe4
--- /dev/null
+++ b/simstring/measure/overlap.py
@@ -0,0 +1,30 @@
+from simstring.measure.base import BaseMeasure
+from sys import maxsize
+import math
+
+
+class OverlapMeasure(BaseMeasure):
+    """Overlap coefficient measure: |X & Y| / min(|X|, |Y|)."""
+
+    def min_feature_size(self, query_size, alpha):
+        # The overlap coefficient sets no lower bound on candidate size.
+        return 1
+
+    def max_feature_size(self, query_size, alpha):
+        # No upper bound either; the searcher caps this value with
+        # db.max_feature_size() when the database provides that method.
+        return maxsize
+
+    def minimum_common_feature_count(self, query_size, y_size, alpha):
+        # similarity >= alpha  <=>  |X & Y| >= alpha * min(|X|, |Y|)
+        return int(math.ceil(alpha * min(query_size, y_size)))
+
+    def similarity(self, X, Y):
+        """Return the overlap coefficient of X and Y, a float in [0, 1]."""
+        x_features = set(X)
+        y_features = set(Y)
+        smaller_size = min(len(x_features), len(y_features))
+        if smaller_size == 0:
+            # An empty feature set shares nothing; avoid dividing by zero.
+            return 0.0
+        return len(x_features & y_features) / float(smaller_size)
diff --git a/simstring/searcher.py b/simstring/searcher.py
index 8365706..98ace28 100644
--- a/simstring/searcher.py
+++ b/simstring/searcher.py
@@ -14,6 +14,8 @@ def search(self, query_string, alpha):
         features = self.feature_extractor.features(query_string)
         min_feature_size = self.measure.min_feature_size(len(features), alpha)
         max_feature_size = self.measure.max_feature_size(len(features), alpha)
+        if hasattr(self.db, 'max_feature_size'):
+            max_feature_size = min(max_feature_size, self.db.max_feature_size())
         results = []
 
         for candidate_feature_size in range(min_feature_size, max_feature_size + 1):
diff --git a/tests/measure/test_overlap.py b/tests/measure/test_overlap.py
new file mode 100644
index 0000000..4339fdb
--- /dev/null
+++ b/tests/measure/test_overlap.py
@@ -0,0 +1,40 @@
+# -*- coding:utf-8 -*-
+
+from unittest import TestCase
+from simstring.measure.overlap import OverlapMeasure
+from sys import maxsize
+
+class TestOverlap(TestCase):
+    measure = OverlapMeasure()
+
+    def test_min_feature_size(self):
+        self.assertEqual(self.measure.min_feature_size(5, 1.0), 1)
+        self.assertEqual(self.measure.min_feature_size(5, 0.5), 1)
+
+    def test_max_feature_size(self):
+        self.assertEqual(self.measure.max_feature_size(5, 1.0), maxsize)
+        self.assertEqual(self.measure.max_feature_size(5, 0.5), maxsize)
+
+    def test_minimum_common_feature_count(self):
+        self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 5)
+        self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 5)
+        self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3)
+
+    def test_similarity(self):
+        x = [1, 2, 3]
+        y = [1, 2, 3, 4]
+        self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
+        # x is a subset of y, so the overlap coefficient is still 1.0.
+        self.assertEqual(round(self.measure.similarity(x, y), 2), 1.0)
+
+        # Duplicate features count once.
+        z = [1, 1, 2, 3]
+        self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
+
+        # One shared feature out of min(3, 4) = 3.
+        w = [3, 4, 5, 6]
+        self.assertEqual(round(self.measure.similarity(x, w), 2), 0.33)
+
+        # Disjoint and empty inputs score zero.
+        self.assertEqual(self.measure.similarity([1, 2], [3, 4]), 0.0)
+        self.assertEqual(self.measure.similarity([], x), 0.0)