Skip to content

Commit

Permalink
Move predicate filtering out of datamodel
Browse files Browse the repository at this point in the history
Part of the quest to move the implementation details
of predicates out of DataModel and into the things that
actually care about them.

This slightly changes the behavior in the test: because we no longer
do any filtering either way, we use ALL predicates from the
variable definitions.
  • Loading branch information
NickCrews committed Aug 11, 2022
1 parent 696018d commit 37c0e36
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 10 deletions.
11 changes: 3 additions & 8 deletions dedupe/datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,12 @@ def _field_comparators(
yield (var.field, comparator, start, stop)
start = stop

def predicates(self, canopies: bool = True) -> set[Predicate]:
@property
def predicates(self) -> set[Predicate]:
predicates = set()
for var in self.primary_variables:
for predicate in var.predicates:
if hasattr(predicate, "index"):
is_canopy = hasattr(predicate, "canopy")
if is_canopy == canopies:
predicates.add(predicate)
else:
predicates.add(predicate)

predicates.add(predicate)
return predicates

def distances(
Expand Down
20 changes: 18 additions & 2 deletions dedupe/labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,20 @@ def _sample_indices(self, sample_size: int) -> Iterable[RecordIDPair]:
return sample_ids


def _filter_canopy_predicates(
predicates: Iterable[Predicate], canopies: bool
) -> set[Predicate]:
result = set()
for predicate in predicates:
if hasattr(predicate, "index"):
is_canopy = hasattr(predicate, "canopy")
if is_canopy == canopies:
result.add(predicate)
else:
result.add(predicate)
return result


class DedupeBlockLearner(BlockLearner):
def __init__(
self,
Expand All @@ -239,7 +253,8 @@ def __init__(

index_data = sample_records(data, 50000)
sampled_records = sample_records(index_data, N_SAMPLED_RECORDS)
preds = self.data_model.predicates()
preds = self.data_model.predicates
preds = _filter_canopy_predicates(preds, canopies=True)

self.block_learner = training.DedupeBlockLearner(
preds, sampled_records, index_data
Expand Down Expand Up @@ -293,7 +308,8 @@ def __init__(
index_data = sample_records(data_2, 50000)
sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS)

preds = self.data_model.predicates(canopies=False)
preds = self.data_model.predicates
preds = _filter_canopy_predicates(preds, canopies=False)

self.block_learner = training.RecordLinkBlockLearner(
preds, sampled_records_1, sampled_records_2, index_data
Expand Down

0 comments on commit 37c0e36

Please sign in to comment.