Skip to content

Commit

Permalink
start to use sklearn for ml algorithms
Browse files Browse the repository at this point in the history
rlrlearner -> rflearner

update reqs

use sklearn clustering

update tests
  • Loading branch information
fgregg committed May 6, 2022
1 parent 35ba047 commit a193834
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 35 deletions.
5 changes: 2 additions & 3 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

import numpy
import json
import rlr
import sklearn.ensemble

import dedupe.core as core
import dedupe.serializer as serializer
Expand Down Expand Up @@ -1034,8 +1034,7 @@ class ActiveMatching(Matching):
"""
Class for training a matcher.
"""

classifier = rlr.RegularizedLogisticRegression()
classifier = sklearn.ensemble.RandomForestClassifier()

def __init__(
self,
Expand Down
9 changes: 4 additions & 5 deletions dedupe/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import tempfile

import numpy
import fastcluster
import hcluster
import scipy.cluster.hierarchy

from typing import Iterable, Dict, cast, List, Set, Generator, Sequence, Tuple
from dedupe._typing import Clusters, RecordID, Links
Expand Down Expand Up @@ -217,11 +216,11 @@ def cluster(

i_to_id, condensed_distances, N = condensedDistance(sub_graph)

linkage = fastcluster.linkage(
condensed_distances, method="centroid", preserve_input=True
linkage = scipy.cluster.hierarchy.linkage(
condensed_distances, method="centroid"
)

partition = hcluster.fcluster(
partition = scipy.cluster.hierarchy.fcluster(
linkage, distance_threshold, criterion="distance"
)

Expand Down
16 changes: 8 additions & 8 deletions dedupe/labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import logging

import numpy
import rlr
from typing import List
from typing_extensions import Protocol
import sklearn.ensemble

import dedupe.core as core
import dedupe.training as training
Expand Down Expand Up @@ -38,9 +38,9 @@ class HasDataModel(Protocol):
data_model: datamodel.DataModel


class RLRLearner(ActiveLearner, rlr.RegularizedLogisticRegression):
class RFLearner(sklearn.ensemble.RandomForestClassifier, ActiveLearner):
def __init__(self, data_model):
super().__init__(alpha=1)
super().__init__()
self.data_model = data_model
self._candidates: List[TrainingExample]

Expand All @@ -66,7 +66,7 @@ def fit(self, X, y):
self.y = numpy.array(y)
self.X = X

super().fit(self.X, self.y, cv=False)
super().fit(self.X, self.y)

# Convenience wrapper: converts `pairs` with self.transform(), then passes the
# result together with the labels `y` to self.fit(). Nothing is returned explicitly.
def fit_transform(self, pairs, y):
self.fit(self.transform(pairs), y)
Expand Down Expand Up @@ -118,7 +118,7 @@ def _bias(self):
return weighted_bias

def candidate_scores(self):
return self.predict_proba(self.distances)
return self.predict_proba(self.distances)[:, 1].reshape(-1, 1)

# len(learner) reports how many entries are currently in self.candidates.
def __len__(self):
return len(self.candidates)
Expand Down Expand Up @@ -312,7 +312,7 @@ def _sample(self, data_1, data_2, sample_size):

class DisagreementLearner(ActiveLearner):

classifier: RLRLearner
classifier: RFLearner
blocker: BlockLearner
candidates: List[TrainingExample]

Expand Down Expand Up @@ -417,7 +417,7 @@ def __init__(

self.candidates = self.blocker.candidates

self.classifier = RLRLearner(self.data_model)
self.classifier = RFLearner(self.data_model)
self.classifier.candidates = self.candidates

self._common_init()
Expand Down Expand Up @@ -449,7 +449,7 @@ def __init__(
self.blocker = RecordLinkBlockLearner(data_model, data_1, data_2, index_include)
self.candidates = self.blocker.candidates

self.classifier = RLRLearner(self.data_model)
self.classifier = RFLearner(self.data_model)
self.classifier.candidates = self.candidates

self._common_init()
Expand Down
4 changes: 1 addition & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@
from Cython.Build import cythonize

install_requires = [
"fastcluster",
"dedupe-hcluster",
"scikit-learn",
"affinegap>=1.3",
"categorical-distance>=1.9",
"dedupe-variable-datetime",
"rlr>=2.4.3",
"numpy>=1.13",
"doublemetaphone",
"highered>=0.2.0",
Expand Down
18 changes: 15 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,21 @@
import random

import numpy
import scipy.special

import dedupe


class MockClassifier:
    """Minimal stand-in classifier for the scoring tests.

    Scores are a logistic function of the inputs:
    ``expit(examples * weight + bias)``.

    ``weight`` and ``bias`` are plain attributes; they can be supplied to
    the constructor or reassigned afterwards.
    """

    def __init__(self, weight=0, bias=0):
        # With the default weight and bias of 0, every score is
        # expit(0) == 0.5 until the test configures the mock.
        self.weight = weight
        self.bias = bias

    def predict_proba(self, examples):
        """Return ``expit(examples * weight + bias)`` element-wise.

        ``examples`` may be any array-like (ndarray or nested sequence);
        the result is an ndarray of the same shape.
        """
        # Coerce first so plain lists get numeric broadcasting instead of
        # Python's list-repetition semantics (or a TypeError for floats).
        examples = numpy.asarray(examples)
        return scipy.special.expit(examples * self.weight + self.bias)


class ScoreDuplicates(unittest.TestCase):
def setUp(self):
random.seed(123)
Expand Down Expand Up @@ -39,8 +50,9 @@ def setUp(self):

deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
self.data_model = deduper.data_model
self.classifier = deduper.classifier
self.classifier.weights = [-1.0302742719650269]
self.classifier = MockClassifier()

self.classifier.weight = -1.0302742719650269
self.classifier.bias = 4.76

score_dtype = [("pairs", "<U192", 2), ("score", "f4")]
Expand Down Expand Up @@ -68,7 +80,7 @@ def test_score_duplicates(self):
)

def test_score_duplicates_with_zeros(self):
self.classifier.weights = [-1000]
self.classifier.weight = -1000
self.classifier.bias = 1000
self.records = iter(
[
Expand Down
17 changes: 4 additions & 13 deletions tests/test_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,17 @@ def setUp(self):
def test_AL(self):
random.seed(1111111111110)
original_N = len(SAMPLE)
active_learner = dedupe.labeler.RLRLearner(self.data_model)
active_learner = dedupe.labeler.RFLearner(self.data_model)
active_learner.candidates = SAMPLE
assert len(active_learner) == original_N
pair = active_learner.pop()
print(pair)
assert pair == (
{"name": "Willy", "age": "35"},
{"name": "William", "age": "35"},
)

active_learner.pop()
assert len(active_learner) == original_N - 1

pair = active_learner.pop()
print(pair)
assert pair == ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"})
active_learner.pop()
assert len(active_learner) == original_N - 2

pair = active_learner.pop()
assert pair == ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"})

active_learner.pop()
assert len(active_learner) == original_N - 3

active_learner.pop()
Expand Down

0 comments on commit a193834

Please sign in to comment.