Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

start to use sklearn for ml algorithms #992

Merged
merged 32 commits into main from sklearn_depend
Jun 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a193834
start to use sklearn for ml algorithms
fgregg Apr 26, 2022
8bd2022
continue on error for benchmarks
fgregg May 5, 2022
3c34d99
use sklearn for cosine distances
fgregg May 6, 2022
513eb12
fixup: Format Python code with Black
cclauss May 6, 2022
576a6dc
use better autoblackener
fgregg May 6, 2022
9bc5d8b
Revert "use sklearn for cosine distances"
fgregg May 6, 2022
fdb2bdd
Merge branch 'main' into sklearn_depend
fgregg May 6, 2022
9dd5a8a
increase timeout, closes #1008
fgregg May 6, 2022
c7f18f3
merge conflict
fgregg May 6, 2022
882f52a
Merge branch 'big_sample' into sklearn_depend
fgregg May 21, 2022
a87e770
Merge branch 'main' into sklearn_depend
fgregg May 25, 2022
8185e83
Merge branch 'main' into sklearn_depend
fgregg May 26, 2022
30cbb2e
Merge branch 'main' into sklearn_depend
fgregg May 26, 2022
3ef56a3
Merge branch 'main' into sklearn_depend
fgregg May 26, 2022
00dc6ee
Merge branch 'main' into sklearn_depend
fgregg May 26, 2022
00ba1c6
Merge branch 'sklearn_depend' of github.com:dedupeio/dedupe into sklearn_depend
fgregg May 26, 2022
c0e940b
Merge branch 'main' into sklearn_depend
fgregg May 26, 2022
71263e2
Merge branch 'main' into sklearn_depend
fgregg May 26, 2022
718f949
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
fae17d3
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
8b3ddb7
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
8876c04
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
921d47e
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
9819003
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
adbb8c8
Merge branch 'main' into sklearn_depend
fgregg May 27, 2022
955665f
try cross validation rf
fgregg May 27, 2022
bab573c
Merge branch 'sklearn_depend' of github.com:dedupeio/dedupe into sklearn_depend
fgregg May 27, 2022
9412c74
fixup! Format Python code with psf/black pull_request
fgregg May 27, 2022
52ffcf6
use regularized logistic regression
fgregg Jun 2, 2022
90efd62
rlrlearner
fgregg Jun 2, 2022
02b95e8
rlrlearner
fgregg Jun 2, 2022
ca861b9
good error message if rlr is missing
fgregg Jun 2, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 18 additions & 15 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,31 @@ jobs:
# so run this before anything else.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Set up Python 3.10
uses: actions/setup-python@v2
with:
python-version: "3.10"
- name: Install Black
run: pip install black
- name: Run black --check .
run: black --check .
- uses: actions/checkout@v2
with: # https://github.com/stefanzweifel/git-auto-commit-action#checkout-the-correct-branch
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- run: pip install black
- run: black --check .
- name: If needed, commit black changes to the pull request
if: failure()
run: |
black .
git config --global user.name 'autoblack'
git config --global user.email 'cclauss@users.noreply.github.com'
printenv | grep GITHUB
git config --global user.name 'fgregg'
git config --global user.email 'fgregg@users.noreply.github.com'
git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY
git checkout $GITHUB_HEAD_REF
git commit -am "fixup: Format Python code with Black"
git remote -v
git branch
git status
black .
git status
echo ready to commit
git commit -am "fixup! Format Python code with psf/black pull_request"
echo ready to push
git push
test:
needs: format
timeout-minutes: 30
timeout-minutes: 40
runs-on: ${{ matrix.os }}
strategy:
matrix:
Expand Down
24 changes: 22 additions & 2 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
import tempfile

import numpy
import rlr
import sklearn.linear_model
import sklearn.model_selection

import dedupe.core as core
import dedupe.serializer as serializer
Expand Down Expand Up @@ -1016,6 +1017,19 @@ def __init__(
"the current version of dedupe. This can happen "
"if you have recently upgraded dedupe."
)
except ModuleNotFoundError as exc:
if "No module named 'rlr'" in str(exc):
raise SettingsFileLoadingException(
"This settings file was created with a previous "
"version of dedupe that used the 'rlr' library. "
"To continue to use this settings file, you need "
"install that library: `pip install rlr`"
)
else:
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
"Try deleting the file"
) from exc
except: # noqa: E722
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
Expand All @@ -1034,7 +1048,13 @@ class ActiveMatching(Matching):
Class for training a matcher.
"""

classifier = rlr.RegularizedLogisticRegression()
classifier = sklearn.model_selection.GridSearchCV(
estimator=sklearn.linear_model.LogisticRegression(),
param_grid={"C": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
scoring="f1",
verbose=3,
n_jobs=-1,
)

def __init__(
self,
Expand Down
9 changes: 4 additions & 5 deletions dedupe/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import tempfile

import numpy
import fastcluster
import hcluster
import scipy.cluster.hierarchy

from typing import Iterable, Dict, cast, List, Set, Generator, Sequence, Tuple
from dedupe._typing import Clusters, RecordID, Links
Expand Down Expand Up @@ -238,11 +237,11 @@ def cluster(

i_to_id, condensed_distances, N = condensedDistance(sub_graph)

linkage = fastcluster.linkage(
condensed_distances, method="centroid", preserve_input=True
linkage = scipy.cluster.hierarchy.linkage(
condensed_distances, method="centroid"
)

partition = hcluster.fcluster(
partition = scipy.cluster.hierarchy.fcluster(
linkage, distance_threshold, criterion="distance"
)

Expand Down
10 changes: 5 additions & 5 deletions dedupe/labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import logging

import numpy
import rlr
from typing import List
from typing_extensions import Protocol
import sklearn.linear_model

import dedupe.core as core
import dedupe.training as training
Expand Down Expand Up @@ -38,9 +38,9 @@ class HasDataModel(Protocol):
data_model: datamodel.DataModel


class RLRLearner(ActiveLearner, rlr.RegularizedLogisticRegression):
class RLRLearner(sklearn.linear_model.LogisticRegression, ActiveLearner):
def __init__(self, data_model):
super().__init__(alpha=1)
super().__init__()
self.data_model = data_model
self._candidates: List[TrainingExample]

Expand All @@ -66,7 +66,7 @@ def fit(self, X, y):
self.y = numpy.array(y)
self.X = X

super().fit(self.X, self.y, cv=False)
super().fit(self.X, self.y)

def fit_transform(self, pairs, y):
self.fit(self.transform(pairs), y)
Expand Down Expand Up @@ -118,7 +118,7 @@ def _bias(self):
return weighted_bias

def candidate_scores(self):
return self.predict_proba(self.distances)
return self.predict_proba(self.distances)[:, 1].reshape(-1, 1)

def __len__(self):
return len(self.candidates)
Expand Down
4 changes: 1 addition & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@
from Cython.Build import cythonize

install_requires = [
"fastcluster",
"dedupe-hcluster",
"scikit-learn",
"affinegap>=1.3",
"categorical-distance>=1.9",
"dedupe-variable-datetime",
"rlr>=2.4.3",
"numpy>=1.13",
"doublemetaphone",
"highered>=0.2.0",
Expand Down
18 changes: 15 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,21 @@
import random

import numpy
import scipy.special

import dedupe


class MockClassifier:
    """Minimal stand-in for a fitted classifier.

    Scores come from a single-weight logistic model,
    ``expit(examples * weight + bias)``, so a test can set ``weight`` and
    ``bias`` directly instead of training a real model.
    """

    def __init__(self, weight=0, bias=0):
        """Store the model parameters.

        Args:
            weight: Multiplier applied to every input value.
            bias: Constant added before the logistic function is applied.
        """
        self.weight = weight
        self.bias = bias

    def predict_proba(self, examples):
        """Return the logistic score of each value in ``examples``.

        Args:
            examples: Array-like of numeric values. It is converted with
                ``numpy.asarray`` so that plain lists are scaled
                element-wise instead of being repeated by ``*``.

        Returns:
            A numpy array with the same shape as ``examples``.
        """
        values = numpy.asarray(examples)
        return scipy.special.expit(values * self.weight + self.bias)


class ScoreDuplicates(unittest.TestCase):
def setUp(self):
random.seed(123)
Expand Down Expand Up @@ -39,8 +50,9 @@ def setUp(self):

deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
self.data_model = deduper.data_model
self.classifier = deduper.classifier
self.classifier.weights = [-1.0302742719650269]
self.classifier = MockClassifier()

self.classifier.weight = -1.0302742719650269
self.classifier.bias = 4.76

score_dtype = [("pairs", "<U192", 2), ("score", "f4")]
Expand Down Expand Up @@ -68,7 +80,7 @@ def test_score_duplicates(self):
)

def test_score_duplicates_with_zeros(self):
self.classifier.weights = [-1000]
self.classifier.weight = -1000
self.classifier.bias = 1000
self.records = iter(
[
Expand Down
15 changes: 3 additions & 12 deletions tests/test_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,14 @@ def test_AL(self):
active_learner = dedupe.labeler.RLRLearner(self.data_model)
active_learner.candidates = SAMPLE
assert len(active_learner) == original_N
pair = active_learner.pop()
print(pair)
assert pair == (
{"name": "Willy", "age": "35"},
{"name": "William", "age": "35"},
)

active_learner.pop()
assert len(active_learner) == original_N - 1

pair = active_learner.pop()
print(pair)
assert pair == ({"name": "Jimmy", "age": "20"}, {"name": "Jimbo", "age": "21"})
active_learner.pop()
assert len(active_learner) == original_N - 2

pair = active_learner.pop()
assert pair == ({"name": "Meredith", "age": "40"}, {"name": "Sue", "age": "10"})

active_learner.pop()
assert len(active_learner) == original_N - 3

active_learner.pop()
Expand Down