Skip to content

Add overlap measure #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ print(results)
- Cosine
- Dice
- Jaccard
- Overlap

## Run Tests
```
Expand Down
16 changes: 16 additions & 0 deletions simstring/measure/overlap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from simstring.measure.base import BaseMeasure
from sys import maxsize
import math

class OverlapMeasure(BaseMeasure):
    """Overlap-style measure built on top of ``BaseMeasure``.

    The size bounds returned here are deliberately unconstrained
    (``1`` .. ``sys.maxsize``); neither depends on the query size or on
    ``alpha``.
    """

    def min_feature_size(self, query_size, alpha):
        """Return the lower bound on candidate feature-set size.

        Both arguments are accepted but ignored: the bound is always 1.
        """
        lower_bound = 1
        return lower_bound

    def max_feature_size(self, query_size, alpha):
        """Return the upper bound on candidate feature-set size.

        Both arguments are accepted but ignored: the bound is always
        ``sys.maxsize``.
        """
        upper_bound = maxsize
        return upper_bound

    def minimum_common_feature_count(self, query_size, y_size, alpha):
        """Return ``ceil(alpha * min(query_size, y_size))`` as an ``int``.

        Args:
            query_size: Number of features in the query.
            y_size: Number of features in the candidate.
            alpha: Multiplier applied to the smaller of the two sizes.
        """
        smaller_size = min(query_size, y_size)
        threshold = math.ceil(alpha * smaller_size)
        # int() keeps the result integral even where math.ceil returns a float.
        return int(threshold)

    def similarity(self, X, Y):
        """Return the smaller of the distinct-element counts of ``X`` and ``Y``.

        Duplicates within each input are collapsed before counting.
        """
        # NOTE(review): this value does not depend on which elements X and Y
        # have in common, and it is not scaled to [0, 1]. The usual overlap
        # coefficient is |X & Y| / min(|X|, |Y|) -- confirm this is intended.
        distinct_in_x = len(set(X))
        distinct_in_y = len(set(Y))
        return min(distinct_in_x, distinct_in_y)
2 changes: 2 additions & 0 deletions simstring/searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def search(self, query_string, alpha):
features = self.feature_extractor.features(query_string)
min_feature_size = self.measure.min_feature_size(len(features), alpha)
max_feature_size = self.measure.max_feature_size(len(features), alpha)
if hasattr(self.db, 'max_feature_size'):
max_feature_size = min(max_feature_size, self.db.max_feature_size())
results = []

for candidate_feature_size in range(min_feature_size, max_feature_size + 1):
Expand Down
30 changes: 30 additions & 0 deletions tests/measure/test_overlap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# -*- coding:utf-8 -*-

from unittest import TestCase
from simstring.measure.overlap import OverlapMeasure
from sys import maxsize

class TestOverlap(TestCase):
    """Unit tests pinning the values returned by ``OverlapMeasure``."""

    # One instance, created at class-definition time and shared by every test.
    measure = OverlapMeasure()

    def test_min_feature_size(self):
        """The lower size bound is 1 for each alpha tried."""
        for alpha in (1.0, 0.5):
            lower_bound = self.measure.min_feature_size(5, alpha)
            self.assertEqual(lower_bound, 1)

    def test_max_feature_size(self):
        """The upper size bound is ``sys.maxsize`` for each alpha tried."""
        for alpha in (1.0, 0.5):
            upper_bound = self.measure.max_feature_size(5, alpha)
            self.assertEqual(upper_bound, maxsize)

    def test_minimum_common_feature_count(self):
        """The threshold is ceil(alpha * smaller size)."""
        # ((query_size, y_size, alpha), expected)
        cases = (
            ((5, 5, 1.0), 5),
            ((5, 20, 1.0), 5),
            ((5, 5, 0.5), 3),  # ceil(2.5) == 3
        )
        for arguments, expected in cases:
            actual = self.measure.minimum_common_feature_count(*arguments)
            self.assertEqual(actual, expected)

    def test_similarity(self):
        """Each pair below scores 3 after rounding to two places."""
        x = [1, 2, 3]
        y = [1, 2, 3, 4]
        # z repeats an element; it still has three distinct values.
        z = [1, 1, 2, 3]
        for left, right in ((x, x), (x, y), (z, z)):
            score = self.measure.similarity(left, right)
            self.assertEqual(round(score, 2), 3)