Skip to content

Commit

Permalink
use sklearn for cosine distances
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed May 6, 2022
1 parent 8bd2022 commit 3c34d99
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 5 deletions.
26 changes: 24 additions & 2 deletions dedupe/variables/set.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
from .base import FieldType
from dedupe import predicates
from simplecosine.cosine import CosineSetSimilarity

import sklearn.feature_extraction.text
import sklearn.metrics.pairwise


def no_op(x):
    """Return ``x`` unchanged.

    Used as the analyzer callable of ``TfidfSetVectorizer`` so that each
    document is handed to the vectorizer exactly as it was supplied.
    """
    # NOTE(review): presumably a named module-level function (rather than a
    # lambda) so that objects holding it can be pickled -- confirm.
    return x


class TfidfSetVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
    """TF-IDF vectorizer for documents that are already collections of tokens.

    The analyzer is replaced with an identity function, so the vectorizer
    iterates over each document as given instead of running scikit-learn's
    own preprocessing and tokenisation on it.
    """

    def build_analyzer(self):
        """Return the identity function ``no_op`` as the analyzer callable."""
        return no_op


class SetType(FieldType):
Expand Down Expand Up @@ -28,4 +39,15 @@ def __init__(self, definition):
if "corpus" not in definition:
definition["corpus"] = []

self.comparator = CosineSetSimilarity(definition["corpus"])
corpus = (doc for doc in definition["corpus"] if doc)

self.vectorizer = TfidfSetVectorizer()
self.vectorizer.fit(corpus)

self._cosine = sklearn.metrics.pairwise.cosine_similarity

def comparator(self, field_1, field_2):
    """Return the cosine similarity of two set-valued fields as a float.

    Each field is turned into a single-row TF-IDF vector by the vectorizer
    fitted in ``__init__`` and the two rows are compared with
    ``self._cosine``.

    Args:
        field_1: First field value, passed to the vectorizer as one document.
        field_2: Second field value, passed to the vectorizer as one document.

    Returns:
        The similarity score as a plain Python ``float``.
    """
    vector_1 = self.vectorizer.transform([field_1])
    vector_2 = self.vectorizer.transform([field_2])

    # The pairwise cosine function yields a matrix with one row per left
    # sample and one column per right sample; with a single document on
    # each side that is a 1x1 array, so unpack it to return a scalar score
    # rather than a 2-D array.
    similarity = self._cosine(vector_1, vector_2)
    return float(similarity[0, 0])
17 changes: 15 additions & 2 deletions dedupe/variables/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

from affinegap import normalizedAffineGapDistance as affineGap
from highered import CRFEditDistance
from simplecosine.cosine import CosineTextSimilarity

import sklearn.feature_extraction.text
import sklearn.metrics.pairwise

from typing import Optional

Expand Down Expand Up @@ -103,4 +105,15 @@ def __init__(self, definition):
if "corpus" not in definition:
definition["corpus"] = []

self.comparator = CosineTextSimilarity(definition["corpus"])
corpus = (doc for doc in definition["corpus"] if doc)

self.vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
self.vectorizer.fit(corpus)

self._cosine = sklearn.metrics.pairwise.cosine_similarity

def comparator(self, field_1, field_2):
    """Return the cosine similarity of two text fields as a float.

    Each field is turned into a single-row TF-IDF vector by the vectorizer
    fitted in ``__init__`` and the two rows are compared with
    ``self._cosine``.

    Args:
        field_1: First text value, passed to the vectorizer as one document.
        field_2: Second text value, passed to the vectorizer as one document.

    Returns:
        The similarity score as a plain Python ``float``.
    """
    vector_1 = self.vectorizer.transform([field_1])
    vector_2 = self.vectorizer.transform([field_2])

    # The pairwise cosine function yields a matrix with one row per left
    # sample and one column per right sample; with a single document on
    # each side that is a 1x1 array, so unpack it to return a scalar score
    # rather than a 2-D array.
    similarity = self._cosine(vector_1, vector_2)
    return float(similarity[0, 0])
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"numpy>=1.13",
"doublemetaphone",
"highered>=0.2.0",
"simplecosine>=1.2",
"haversine>=0.4.1",
"BTrees>=4.1.4",
"zope.index",
Expand Down

0 comments on commit 3c34d99

Please sign in to comment.