feat: Add ComparisonLevel compositions

This is a spin-off of #1096
moj-analytical-services · Mar 11, 2023 · 458e4d7 · 458e4d7
1 parent e0a2d5e
commit 458e4d7
Show file tree

Hide file tree

Showing 11 changed files with 447 additions and 10 deletions.
diff --git a/docs/comparison_level_composition.md b/docs/comparison_level_composition.md
@@ -0,0 +1,24 @@
+---
+tags:
+  - API
+  - comparisons
+---
+# Documentation for `comparison_level_composition` functions
+
+`comparison_composition` allows the merging of existing comparison levels by a logical SQL clause - `OR`, `AND` or `NOT`.
+
+This extends the functionality of our base comparison levels by allowing users to "join" existing comparisons by various SQL clauses.
+
+For example, `or_(null_level("first_name"), null_level("surname"))` creates a check for nulls in *either* `first_name` or `surname`, rather than restricting the user to a single column.
+
+The detailed API for each of these are outlined below.
+
+## Library comparison composition APIs
+
+::: splink.comparison_level_composition
+    handler: python
+    selection:
+      members:
+        - and_
+        - or_
+        - not_
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -84,6 +84,7 @@ nav:
       - Comparisons Library API:
           - Comparison Library: "comparison_library.md"
           - Comparison Level Library: "comparison_level_library.md"
+          - Comparison Composition: "comparison_level_composition.md"
   - Settings Editor: "settingseditor/editor.md"
   - Settings dictionary reference: "settings_dict_guide.md"
   - Tutorials:

diff --git a/splink/athena/athena_comparison_level_library.py b/splink/athena/athena_comparison_level_library.py
@@ -1,3 +1,4 @@
+from ..comparison_level_composition import and_, not_, or_  # noqa: F401
 from ..comparison_level_library import (
     ArrayIntersectLevelBase,
     ColumnsReversedLevelBase,

diff --git a/splink/comparison_level_composition.py b/splink/comparison_level_composition.py
@@ -0,0 +1,247 @@
+from __future__ import annotations
+
+from typing import Iterable
+
+from .comparison_level import ComparisonLevel
+
+
+def and_(
+    *clls: ComparisonLevel | dict,
+    label_for_charts=None,
+    m_probability=None,
+    is_null_level=None,
+) -> ComparisonLevel:
+    """Merge ComparisonLevels using logical "AND".
+
+    Merge multiple ComparisonLevels into a single ComparisonLevel by
+    merging their SQL conditions using a logical "AND".
+
+    By default, we generate a new `label_for_charts` for the new ComparisonLevel.
+    You can override this, and any other ComparisonLevel attributes, by passing
+    them as keyword arguments.
+
+    Args:
+        *clls (ComparisonLevel | dict): ComparisonLevels or comparison
+            level dictionaries to merge
+        label_for_charts (str, optional): A label for this comparson level,
+            which will appear on charts as a reminder of what the level represents.
+            Defaults to a composition of - `label_1 AND label_2`
+        m_probability (float, optional): Starting value for m probability.
+            Defaults to None.
+        is_null_level (bool, optional): If true, m and u values will not be
+            estimated and instead the match weight will be zero for this column.
+            Defaults to None.
+
+    Examples:
+        >>> # Simple null level composition with an `AND` clause
+        >>> import splink.duckdb.duckdb_comparison_level_library as cll
+        >>> cll.and_(cll.null_level("first_name"), cll.null_level("surname"))
+
+        >>> # Composing a levenshtein level with a custom `contains` level
+        >>> import splink.duckdb.duckdb_comparison_level_library as cll
+        >>> misspelling = cll.levenshtein_level("name", 1)
+        >>> contains = {
+        >>>     "sql_condition": "(contains(name_l, name_r) OR " \
+        >>>     "contains(name_r, name_l))"
+        >>> }
+        >>> merged = cll.and_(misspelling, contains, label_for_charts="Spelling error")
+        >>> merged.as_dict()
+        >>> {
+        >>>    'sql_condition': '(levenshtein("name_l", "name_r") <= 1) ' \
+        >>>    'AND ((contains(name_l, name_r) OR contains(name_r, name_l)))',
+        >>>    'label_for_charts': 'Spelling error'
+        >>> }
+
+    Returns:
+        ComparisonLevel: A new ComparisonLevel with the merged
+            SQL condition
+    """
+    return _cl_merge(
+        *clls,
+        clause="AND",
+        label_for_charts=label_for_charts,
+        m_probability=m_probability,
+        is_null_level=is_null_level,
+    )
+
+
+def or_(
+    *clls: ComparisonLevel | dict,
+    label_for_charts: str | None = None,
+    m_probability: float | None = None,
+    is_null_level: bool | None = None,
+) -> ComparisonLevel:
+    """Merge ComparisonLevels using logical "OR".
+
+    Merge multiple ComparisonLevels into a single ComparisonLevel by
+    merging their SQL conditions using a logical "OR".
+
+    By default, we generate a new `label_for_charts` for the new ComparisonLevel.
+    You can override this, and any other ComparisonLevel attributes, by passing
+    them as keyword arguments.
+
+    Args:
+        *clls (ComparisonLevel | dict): ComparisonLevels or comparison
+            level dictionaries to merge
+        label_for_charts (str, optional): A label for this comparson level,
+            which will appear on charts as a reminder of what the level represents.
+            Defaults to a composition of - `label_1 OR label_2`
+        m_probability (float, optional): Starting value for m probability.
+            Defaults to None.
+        is_null_level (bool, optional): If true, m and u values will not be
+            estimated and instead the match weight will be zero for this column.
+            Defaults to None.
+
+    Examples:
+        >>> # Simple null level composition with an `OR` clause
+        >>> import splink.duckdb.duckdb_comparison_level_library as cll
+        >>> cll.or_(cll.null_level("first_name"), cll.null_level("surname"))
+
+        >>> # Composing a levenshtein level with a custom `contains` level
+        >>> import splink.duckdb.duckdb_comparison_level_library as cll
+        >>> misspelling = cll.levenshtein_level("name", 1)
+        >>> contains = {
+        >>>     "sql_condition": "(contains(name_l, name_r) OR " \
+        >>>     "contains(name_r, name_l))"
+        >>> }
+        >>> merged = cll.or_(misspelling, contains, label_for_charts="Spelling error")
+        >>> merged.as_dict()
+        >>> {
+        >>>    'sql_condition': '(levenshtein("name_l", "name_r") <= 1) ' \
+        >>>    'OR ((contains(name_l, name_r) OR contains(name_r, name_l)))',
+        >>>    'label_for_charts': 'Spelling error'
+        >>> }
+
+    Returns:
+        ComparisonLevel: A new ComparisonLevel with the merged
+            SQL condition
+    """
+
+    return _cl_merge(
+        *clls,
+        clause="OR",
+        label_for_charts=label_for_charts,
+        m_probability=m_probability,
+        is_null_level=is_null_level,
+    )
+
+
+def not_(
+    cll: ComparisonLevel | dict,
+    label_for_charts: str | None = None,
+    m_probability: float | None = None,
+) -> ComparisonLevel:
+    """Negate a ComparisonLevel.
+
+    Returns a ComparisonLevel with the same SQL condition as the input,
+    but prefixed with "NOT".
+
+    By default, we generate a new `label_for_charts` for the new ComparisonLevel.
+    You can override this, and any other ComparisonLevel attributes, by passing
+    them as keyword arguments.
+
+    Args:
+        cll (ComparisonLevel | dict): ComparisonLevel or comparison
+            level dictionary
+        label_for_charts (str, optional): A label for this comparson level,
+            which will appear on charts as a reminder of what the level represents.
+        m_probability (float, optional): Starting value for m probability.
+            Defaults to None.
+
+    Examples:
+        >>> import splink.duckdb.duckdb_comparison_level_library as cll
+        >>> # *Not* a null on first name `first_name`
+        >>> cll.not_(cll.exact_match("first_name"))
+
+        >>> import splink.duckdb.duckdb_comparison_level_library as cll
+        >>> # Find all exact matches *not* on the first of January
+        >>> dob_first_jan =  {
+        >>>    "sql_condition": "SUBSTR(dob_std_l, -5) = '01-01'",
+        >>>    "label_for_charts": "Date is 1st Jan",
+        >>> }
+        >>> exact_match_not_first_jan = cll.and_(
+        >>>     cll.exact_match_level("dob"),
+        >>>     cll.not_(dob_first_jan),
+        >>>     label_for_charts = "Exact match and not the 1st Jan"
+        >>> )
+
+
+    Returns:
+        ComparisonLevel
+            A new ComparisonLevel with the negated SQL condition and label_for_charts
+    """
+    cls, sql_dialect = _parse_comparison_levels(cll)
+    cl = cls[0]
+    result = {}
+    result["sql_condition"] = f"NOT ({cl.sql_condition})"
+
+    # Invert if is_null_level.
+    # If NOT is_null_level, then we don't know if the inverted level is null or not
+    if not cl.is_null_level:
+        result["is_null_level"] = False
+
+    result["label_for_charts"] = (
+        label_for_charts if label_for_charts else f"NOT ({cl.label_for_charts})"
+    )
+
+    if m_probability:
+        result["m_probability"] = m_probability
+
+    return ComparisonLevel(result, sql_dialect=sql_dialect)
+
+
+def _cl_merge(
+    *clls: ComparisonLevel | dict,
+    clause: str,
+    label_for_charts: str | None = None,
+    m_probability: float | None = None,
+    is_null_level: bool | None = None,
+) -> ComparisonLevel:
+    if len(clls) == 0:
+        raise ValueError("Must provide at least one ComparisonLevel")
+
+    cls, sql_dialect = _parse_comparison_levels(*clls)
+    result = {}
+    conditions = ("(" + cl.sql_condition + ")" for cl in cls)
+    result["sql_condition"] = f" {clause} ".join(conditions)
+
+    # Set to null level if all supplied levels are "null levels"
+    if is_null_level is None:
+        if all(d.is_null_level for d in cls):
+            result["is_null_level"] = True
+
+    if label_for_charts:
+        result["label_for_charts"] = label_for_charts
+    else:
+        labels = ("(" + cl.label_for_charts + ")" for cl in cls)
+        result["label_for_charts"] = f" {clause} ".join(labels)
+
+    if m_probability:
+        result["m_probability"] = m_probability
+
+    return ComparisonLevel(result, sql_dialect=sql_dialect)
+
+
+def _parse_comparison_levels(
+    *cls: ComparisonLevel | dict,
+) -> tuple[list[ComparisonLevel], str | None]:
+    cls = [_to_comparison_level(cl) for cl in cls]
+    sql_dialect = _unify_sql_dialects(cls)
+    return cls, sql_dialect
+
+
+def _to_comparison_level(cl: ComparisonLevel | dict) -> ComparisonLevel:
+    if isinstance(cl, ComparisonLevel):
+        return cl
+    else:
+        return ComparisonLevel(cl)
+
+
+def _unify_sql_dialects(cls: Iterable[ComparisonLevel]) -> str | None:
+    sql_dialects = set(cl._sql_dialect for cl in cls)
+    sql_dialects.discard(None)
+    if len(sql_dialects) > 1:
+        raise ValueError("Cannot combine comparison levels with different SQL dialects")
+    elif len(sql_dialects) == 0:
+        return None
+    return sql_dialects.pop()
diff --git a/splink/duckdb/duckdb_comparison_level_library.py b/splink/duckdb/duckdb_comparison_level_library.py
@@ -1,3 +1,4 @@
+from ..comparison_level_composition import and_, not_, or_  # noqa: F401
 from ..comparison_level_library import (
     ArrayIntersectLevelBase,
     ColumnsReversedLevelBase,

diff --git a/splink/spark/spark_comparison_level_library.py b/splink/spark/spark_comparison_level_library.py
@@ -1,3 +1,4 @@
+from ..comparison_level_composition import and_, not_, or_  # noqa: F401
 from ..comparison_level_library import (
     ArrayIntersectLevelBase,
     ColumnsReversedLevelBase,

diff --git a/splink/sqlite/sqlite_comparison_level_library.py b/splink/sqlite/sqlite_comparison_level_library.py
@@ -1,3 +1,4 @@
+from ..comparison_level_composition import and_, not_, or_  # noqa: F401
 from ..comparison_level_library import (
     ColumnsReversedLevelBase,
     DistanceFunctionLevelBase,

diff --git a/tests/basic_settings.py b/tests/basic_settings.py
@@ -151,3 +151,31 @@ def get_settings_dict():
     }
 
     return deepcopy(settings)
+
+
+def name_comparison(cll, sn: str) -> dict:
+    """A comparison using first and surname levels composed together."""
+    return {
+        "output_column_name": "first_name_and_surname",
+        "comparison_levels": [
+            # Null level
+            cll.or_(cll.null_level("first_name"), cll.null_level(sn)),
+            # Exact match on fn and sn
+            cll.or_(
+                cll.exact_match_level("first_name"),
+                cll.exact_match_level(sn),
+                m_probability=0.8,
+                label_for_charts="Exact match on first name or surname",
+            ),
+            # (Levenshtein(fn) and jaro_winkler(fn)) or levenshtein(sur)
+            cll.and_(
+                cll.or_(
+                    cll.levenshtein_level("first_name", 2),
+                    cll.jaro_winkler_level("first_name", 0.8),
+                    m_probability=0.8,
+                ),
+                cll.levenshtein_level(sn, 3),
+            ),
+            cll.else_level(0.1),
+        ],
+    }