Move predicate filtering out of datamodel

Part of the quest to move the implementation details of predicates out of DataModel and into the things that actually care about them. This slightly changes the behavior in the test: we don't do any filtering there either way, so we use ALL predicates from the variable definitions.
dedupeio · Aug 11, 2022 · 37c0e36 · 37c0e36
1 parent 696018d
commit 37c0e36
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 10 deletions.
diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
@@ -69,17 +69,12 @@ def _field_comparators(
             yield (var.field, comparator, start, stop)
             start = stop
 
-    def predicates(self, canopies: bool = True) -> set[Predicate]:
+    @property
+    def predicates(self) -> set[Predicate]:
         predicates = set()
         for var in self.primary_variables:
             for predicate in var.predicates:
-                if hasattr(predicate, "index"):
-                    is_canopy = hasattr(predicate, "canopy")
-                    if is_canopy == canopies:
-                        predicates.add(predicate)
-                else:
-                    predicates.add(predicate)
-
+                predicates.add(predicate)
         return predicates
 
     def distances(

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
@@ -225,6 +225,20 @@ def _sample_indices(self, sample_size: int) -> Iterable[RecordIDPair]:
         return sample_ids
 
 
+def _filter_canopy_predicates(
+    predicates: Iterable[Predicate], canopies: bool
+) -> set[Predicate]:
+    result = set()
+    for predicate in predicates:
+        if hasattr(predicate, "index"):
+            is_canopy = hasattr(predicate, "canopy")
+            if is_canopy == canopies:
+                result.add(predicate)
+        else:
+            result.add(predicate)
+    return result
+
+
 class DedupeBlockLearner(BlockLearner):
     def __init__(
         self,
@@ -239,7 +253,8 @@ def __init__(
 
         index_data = sample_records(data, 50000)
         sampled_records = sample_records(index_data, N_SAMPLED_RECORDS)
-        preds = self.data_model.predicates()
+        preds = self.data_model.predicates
+        preds = _filter_canopy_predicates(preds, canopies=True)
 
         self.block_learner = training.DedupeBlockLearner(
             preds, sampled_records, index_data
@@ -293,7 +308,8 @@ def __init__(
         index_data = sample_records(data_2, 50000)
         sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS)
 
-        preds = self.data_model.predicates(canopies=False)
+        preds = self.data_model.predicates
+        preds = _filter_canopy_predicates(preds, canopies=False)
 
         self.block_learner = training.RecordLinkBlockLearner(
             preds, sampled_records_1, sampled_records_2, index_data