start to use sklearn for ml algorithms

RLRLearner -> RFLearner; update reqs; use scipy.cluster.hierarchy for clustering (replacing fastcluster/hcluster, alongside the move to sklearn); update tests
dedupeio · May 6, 2022 · a193834 · a193834
1 parent 35ba047
commit a193834
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 35 deletions.
diff --git a/dedupe/api.py b/dedupe/api.py
@@ -16,7 +16,7 @@
 
 import numpy
 import json
-import rlr
+import sklearn.ensemble
 
 import dedupe.core as core
 import dedupe.serializer as serializer
@@ -1034,8 +1034,7 @@ class ActiveMatching(Matching):
     """
     Class for training a matcher.
     """
-
-    classifier = rlr.RegularizedLogisticRegression()
+    classifier = sklearn.ensemble.RandomForestClassifier()
 
     def __init__(
         self,

diff --git a/dedupe/clustering.py b/dedupe/clustering.py
@@ -8,8 +8,7 @@
 import tempfile
 
 import numpy
-import fastcluster
-import hcluster
+import scipy.cluster.hierarchy
 
 from typing import Iterable, Dict, cast, List, Set, Generator, Sequence, Tuple
 from dedupe._typing import Clusters, RecordID, Links
@@ -217,11 +216,11 @@ def cluster(
 
             i_to_id, condensed_distances, N = condensedDistance(sub_graph)
 
-            linkage = fastcluster.linkage(
-                condensed_distances, method="centroid", preserve_input=True
+            linkage = scipy.cluster.hierarchy.linkage(
+                condensed_distances, method="centroid"
             )
 
-            partition = hcluster.fcluster(
+            partition = scipy.cluster.hierarchy.fcluster(
                 linkage, distance_threshold, criterion="distance"
             )
 

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
@@ -3,9 +3,9 @@
 import logging
 
 import numpy
-import rlr
 from typing import List
 from typing_extensions import Protocol
+import sklearn.ensemble
 
 import dedupe.core as core
 import dedupe.training as training
@@ -38,9 +38,9 @@ class HasDataModel(Protocol):
     data_model: datamodel.DataModel
 
 
-class RLRLearner(ActiveLearner, rlr.RegularizedLogisticRegression):
+class RFLearner(sklearn.ensemble.RandomForestClassifier, ActiveLearner):
     def __init__(self, data_model):
-        super().__init__(alpha=1)
+        super().__init__()
         self.data_model = data_model
         self._candidates: List[TrainingExample]
 
@@ -66,7 +66,7 @@ def fit(self, X, y):
         self.y = numpy.array(y)
         self.X = X
 
-        super().fit(self.X, self.y, cv=False)
+        super().fit(self.X, self.y)
 
     def fit_transform(self, pairs, y):
         self.fit(self.transform(pairs), y)
@@ -118,7 +118,7 @@ def _bias(self):
         return weighted_bias
 
     def candidate_scores(self):
-        return self.predict_proba(self.distances)
+        return self.predict_proba(self.distances)[:, 1].reshape(-1, 1)
 
     def __len__(self):
         return len(self.candidates)
@@ -312,7 +312,7 @@ def _sample(self, data_1, data_2, sample_size):
 
 class DisagreementLearner(ActiveLearner):
 
-    classifier: RLRLearner
+    classifier: RFLearner
     blocker: BlockLearner
     candidates: List[TrainingExample]
 
@@ -417,7 +417,7 @@ def __init__(
 
         self.candidates = self.blocker.candidates
 
-        self.classifier = RLRLearner(self.data_model)
+        self.classifier = RFLearner(self.data_model)
         self.classifier.candidates = self.candidates
 
         self._common_init()
@@ -449,7 +449,7 @@ def __init__(
         self.blocker = RecordLinkBlockLearner(data_model, data_1, data_2, index_include)
         self.candidates = self.blocker.candidates
 
-        self.classifier = RLRLearner(self.data_model)
+        self.classifier = RFLearner(self.data_model)
         self.classifier.candidates = self.candidates
 
         self._common_init()

diff --git a/setup.py b/setup.py
@@ -11,12 +11,10 @@
 from Cython.Build import cythonize
 
 install_requires = [
-    "fastcluster",
-    "dedupe-hcluster",
+    "scikit-learn",
     "affinegap>=1.3",
     "categorical-distance>=1.9",
     "dedupe-variable-datetime",
-    "rlr>=2.4.3",
     "numpy>=1.13",
     "doublemetaphone",
     "highered>=0.2.0",

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -2,10 +2,21 @@
 import random
 
 import numpy
+import scipy.special
 
 import dedupe
 
 
+class MockClassifier:
+    def __init__(self):
+
+        self.weight = 0
+        self.bias = 0
+
+    def predict_proba(self, examples):
+        return scipy.special.expit(examples * self.weight + self.bias)
+
+
 class ScoreDuplicates(unittest.TestCase):
     def setUp(self):
         random.seed(123)
@@ -39,8 +50,9 @@ def setUp(self):
 
         deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
         self.data_model = deduper.data_model
-        self.classifier = deduper.classifier
-        self.classifier.weights = [-1.0302742719650269]
+        self.classifier = MockClassifier()
+
+        self.classifier.weight = -1.0302742719650269
         self.classifier.bias = 4.76
 
         score_dtype = [("pairs", "<U192", 2), ("score", "f4")]
@@ -68,7 +80,7 @@ def test_score_duplicates(self):
         )
 
     def test_score_duplicates_with_zeros(self):
-        self.classifier.weights = [-1000]
+        self.classifier.weight = -1000
         self.classifier.bias = 1000
         self.records = iter(
             [

diff --git a/tests/test_labeler.py b/tests/test_labeler.py
@@ -20,26 +20,17 @@ def setUp(self):
     def test_AL(self):
         random.seed(1111111111110)
         original_N = len(SAMPLE)
-        active_learner = dedupe.labeler.RLRLearner(self.data_model)
+        active_learner = dedupe.labeler.RFLearner(self.data_model)
         active_learner.candidates = SAMPLE
         assert len(active_learner) == original_N
-        pair = active_learner.pop()
-        print(pair)
-        assert pair == (
-            {"name": "Willy", "age": "35"},
-            {"name": "William", "age": "35"},
-        )
 
+        active_learner.pop()
         assert len(active_learner) == original_N - 1
 
-        pair = active_learner.pop()
-        print(pair)
-        assert pair == ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"})
+        active_learner.pop()
         assert len(active_learner) == original_N - 2
 
-        pair = active_learner.pop()
-        assert pair == ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"})
-
+        active_learner.pop()
         assert len(active_learner) == original_N - 3
 
         active_learner.pop()