From 3c34d996cd3f5459143099741a164ffc955ce529 Mon Sep 17 00:00:00 2001
From: Forest Gregg
Date: Thu, 5 May 2022 23:03:32 -0400
Subject: [PATCH] use sklearn for cosine distances

---
Review notes (text between the "---" line and the first "diff --git" line
is not part of the commit message):
 * comparator() in both files now unwraps the 1x1 array returned by
   cosine_similarity and returns a float; both fields are vectorized in a
   single transform() call.
 * NOTE(review): the default corpus is an empty list; TfidfVectorizer.fit()
   is expected to raise ValueError (empty vocabulary) in that case. Confirm
   and add a guard or fallback before merging.
 * NOTE(review): simplecosine is dropped from setup.py; confirm that
   scikit-learn is already listed in install_requires.
 * Hunk sizes and the diffstat were updated by hand for the added lines; the
   "index" blob ids were not regenerated.

 dedupe/variables/set.py    | 32 ++++++++++++++++++++++++++++++--
 dedupe/variables/string.py | 19 +++++++++++++++++--
 setup.py                   |  1 -
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/dedupe/variables/set.py b/dedupe/variables/set.py
index f80a75957..ad23292f1 100644
--- a/dedupe/variables/set.py
+++ b/dedupe/variables/set.py
@@ -1,6 +1,21 @@
 from .base import FieldType
 from dedupe import predicates
-from simplecosine.cosine import CosineSetSimilarity
+
+import sklearn.feature_extraction.text
+import sklearn.metrics.pairwise
+
+
+def no_op(x):
+    """Return x unchanged; used as the analyzer for set-valued documents."""
+    return x
+
+
+class TfidfSetVectorizer(sklearn.feature_extraction.text.TfidfVectorizer):
+    """TfidfVectorizer whose analyzer passes each document through as-is."""
+
+    def build_analyzer(self):
+        # Identity analyzer: the document itself is the sequence of terms.
+        return no_op
 
 
 class SetType(FieldType):
@@ -28,4 +43,17 @@ def __init__(self, definition):
         if "corpus" not in definition:
             definition["corpus"] = []
 
-        self.comparator = CosineSetSimilarity(definition["corpus"])
+        corpus = (doc for doc in definition["corpus"] if doc)
+
+        self.vectorizer = TfidfSetVectorizer()
+        # NOTE(review): with the default empty corpus, fit() likely raises
+        # ValueError("empty vocabulary ..."); confirm and guard if so.
+        self.vectorizer.fit(corpus)
+
+        self._cosine = sklearn.metrics.pairwise.cosine_similarity
+
+    def comparator(self, field_1, field_2):
+        """Return the TF-IDF cosine similarity of two sets as a float."""
+        vectors = self.vectorizer.transform([field_1, field_2])
+        # cosine_similarity returns a 1x1 array here; unwrap it to a scalar.
+        return float(self._cosine(vectors[:1], vectors[1:])[0, 0])
diff --git a/dedupe/variables/string.py b/dedupe/variables/string.py
index e1499b6f7..96851a369 100644
--- a/dedupe/variables/string.py
+++ b/dedupe/variables/string.py
@@ -3,7 +3,9 @@
 
 from affinegap import normalizedAffineGapDistance as affineGap
 from highered import CRFEditDistance
-from simplecosine.cosine import CosineTextSimilarity
+
+import sklearn.feature_extraction.text
+import sklearn.metrics.pairwise
 
 from typing import Optional
 
@@ -103,4 +105,17 @@ def __init__(self, definition):
         if "corpus" not in definition:
             definition["corpus"] = []
 
-        self.comparator = CosineTextSimilarity(definition["corpus"])
+        corpus = (doc for doc in definition["corpus"] if doc)
+
+        self.vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
+        # NOTE(review): with the default empty corpus, fit() likely raises
+        # ValueError("empty vocabulary ..."); confirm and guard if so.
+        self.vectorizer.fit(corpus)
+
+        self._cosine = sklearn.metrics.pairwise.cosine_similarity
+
+    def comparator(self, field_1, field_2):
+        """Return the TF-IDF cosine similarity of two texts as a float."""
+        vectors = self.vectorizer.transform([field_1, field_2])
+        # cosine_similarity returns a 1x1 array here; unwrap it to a scalar.
+        return float(self._cosine(vectors[:1], vectors[1:])[0, 0])
diff --git a/setup.py b/setup.py
index d5a14e17f..183348967 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,6 @@
     "numpy>=1.13",
     "doublemetaphone",
     "highered>=0.2.0",
-    "simplecosine>=1.2",
     "haversine>=0.4.1",
     "BTrees>=4.1.4",
     "zope.index",