Add regex extract functionality to comparisons #1203

Merged
merged 37 commits into master from regex_extract
May 10, 2023
Commits
a0c8c68
Added option to include a regex match to exact match level base
zslade Apr 24, 2023
3e27cff
added function to produce sql string with accepted duckdb regex extra…
zslade Apr 24, 2023
5e2410f
added function to produce sql string with accepted spark regex extrac…
zslade Apr 24, 2023
387c414
updated with error message to user if regex extract function doesn't …
zslade Apr 24, 2023
c819cf5
0 not required for duckdb backend
zslade Apr 25, 2023
86a871c
Updated regex function so that can be used with string comparitors an…
zslade Apr 25, 2023
03c6214
updated string comparitor levels and reverse match level to accept re…
zslade Apr 25, 2023
8582f11
Added option to perform a regex match
zslade Apr 26, 2023
bcc98aa
required regex parameter to be str
zslade Apr 26, 2023
acf9e95
Regex extract sql added to Athena backend
zslade Apr 27, 2023
ff2d7a6
Updated error message
zslade Apr 27, 2023
2738ab3
added error message for spark when user enters python escape characte…
zslade Apr 27, 2023
c7e985d
Updated doc strings with examples for using regex_extract option
zslade Apr 27, 2023
e57ab69
removed erroneous bracket
zslade May 2, 2023
2f7730d
updated scripts to latest versions to work with Athena linker
zslade May 2, 2023
205c6b8
Added tests for regex levels for duckdb and spark linker
zslade May 4, 2023
930f221
Added test for invalid regex
zslade May 4, 2023
3105f51
Linted and formatted script test script
zslade May 4, 2023
08c622e
linted and formatted
zslade May 5, 2023
2348bc1
Added option `valid_string_regex` to treat col as null if no match on…
zslade May 7, 2023
93f1489
Updated comparisons with valid_string_regex option
zslade May 7, 2023
18d6041
removed comments
zslade May 7, 2023
c8eb64b
linted
zslade May 7, 2023
e584e2b
linted
zslade May 7, 2023
3d0faca
add `valid_string_regex` option to date comparison
zslade May 7, 2023
a14137d
Added a check for regex_extract and valid_string_regegx parameters
zslade May 7, 2023
ad293b5
linted scripts
zslade May 7, 2023
622acd1
Merge branch 'master' into regex_extract
RossKen May 9, 2023
8c1471a
Update docs and add null level for dates
RossKen May 9, 2023
efc2f27
Merge branch 'master' into regex_extract
RossKen May 9, 2023
787b72c
revert athena linker rebase error
RossKen May 9, 2023
9cc5ddd
fix up template library and null level
RossKen May 9, 2023
5afe4e6
add tests for date format parsing
RossKen May 10, 2023
dd6803b
linting
RossKen May 10, 2023
9f664b3
lint with black
RossKen May 10, 2023
e44e321
Merge branch 'regex_extract' of github.com:moj-analytical-services/sp…
RossKen May 10, 2023
9501ede
reinstate cll testing
RossKen May 10, 2023
10 changes: 10 additions & 0 deletions splink/athena/athena_base.py
@@ -7,6 +7,12 @@ def size_array_intersect_sql(col_name_l, col_name_r):
return f"cardinality(array_intersect({col_name_l}, {col_name_r}))"


def regex_extract_sql(col_name, regex):
return f"""
regexp_extract({col_name}, '{regex}')
"""


class AthenaBase(DialectBase):
@property
def _sql_dialect(self):
@@ -19,3 +25,7 @@ def _levenshtein_name(self):
@property
def _size_array_intersect_function(self):
return size_array_intersect_sql

@property
def _regex_extract_function(self):
return regex_extract_sql
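
For reference, the helper above only builds a SQL fragment that the comparison levels embed on both sides of a level condition. A minimal standalone sketch of the same idea (the column name and postcode pattern are illustrative, not taken from this PR):

```python
# Standalone sketch mirroring the Athena helper above; column and pattern
# are illustrative only.
def regex_extract_sql(col_name: str, regex: str) -> str:
    # Athena/Presto regexp_extract returns the first substring matching the pattern
    return f"regexp_extract({col_name}, '{regex}')"

# e.g. compare only the outward part of a UK postcode
print(regex_extract_sql("postcode_l", "^[A-Z]{1,2}[0-9][A-Z0-9]?"))
# regexp_extract(postcode_l, '^[A-Z]{1,2}[0-9][A-Z0-9]?')
```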
207 changes: 197 additions & 10 deletions splink/comparison_level_library.py

Large diffs are not rendered by default.
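
The full diff is collapsed here, but based on the docstrings and commit messages in this PR, an exact-match level restricted to a regex extract would look roughly like the sketch below (module path and keyword name are assumed from elsewhere in this PR, so treat it as illustrative):

```python
# Sketch only: assumes the DuckDB level library exposes
# exact_match_level(col_name, regex_extract=...) after this PR.
import splink.duckdb.duckdb_comparison_level_library as cll

# Exact match on the outward part of a UK postcode, e.g. "SW1A" from "SW1A 1AA"
level = cll.exact_match_level("postcode", regex_extract="^[A-Z]{1,2}[0-9][A-Z0-9]?")

# The level's SQL wraps both sides in regexp_extract, roughly:
#   regexp_extract("postcode_l", '...') = regexp_extract("postcode_r", '...')
```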

224 changes: 206 additions & 18 deletions splink/comparison_library.py

Large diffs are not rendered by default.
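
Similarly collapsed; a hedged sketch of how the `regex_extract` option threads through the comparison library's distance-based comparisons (the exact signature is assumed rather than shown in this diff, mirroring `distance_threshold_comparison_levels` below):

```python
# Sketch only: assumes levenshtein_at_thresholds accepts regex_extract
# after this PR; the pattern is illustrative.
import splink.duckdb.duckdb_comparison_library as cl

# Levenshtein comparison evaluated on the leading alphabetic block of the name
first_name_comparison = cl.levenshtein_at_thresholds(
    "first_name",
    2,                        # distance threshold
    regex_extract="^[A-Z]+",  # illustrative pattern
)
```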

8 changes: 7 additions & 1 deletion splink/comparison_library_utils.py
@@ -67,6 +67,7 @@ def distance_threshold_comparison_levels(
col_name: str,
distance_function_name: str,
distance_threshold_or_thresholds,
regex_extract: str = None,
higher_is_more_similar: bool = True,
m_probability_or_probabilities_thres: list = None,
):
@@ -94,7 +95,12 @@
higher_is_more_similar = True

# these function arguments hold for all cases.
kwargs = dict(col_name=col_name, distance_threshold=thres, m_probability=m_prob)
kwargs = dict(
col_name=col_name,
distance_threshold=thres,
regex_extract=regex_extract,
m_probability=m_prob,
)
# separate out the two that are only used
# when we have a user-supplied function, rather than a predefined subclass
# feels a bit hacky, but will do at least for time being
103 changes: 68 additions & 35 deletions splink/comparison_template_library.py
@@ -21,6 +21,9 @@ class DateComparisonBase(Comparison):
def __init__(
self,
col_name: str,
cast_strings_to_date: bool = False,
date_format: str = None,
invalid_dates_as_null: bool = False,
include_exact_match_level: bool = True,
term_frequency_adjustments: bool = False,
separate_1st_january: bool = False,
@@ -36,8 +39,6 @@ def __init__(
m_probability_or_probabilities_jw: float | list = None,
m_probability_or_probabilities_datediff: float | list = None,
m_probability_else: float = None,
cast_strings_to_date: bool = False,
date_format: str = None,
) -> Comparison:
"""A wrapper to generate a comparison for a date column the data in
`col_name` with preselected defaults.
@@ -51,7 +52,19 @@ def __init__(
- Anything else

Args:
col_name (str): The name of the column to compare
col_name (str): The name of the column to compare.
cast_strings_to_date (bool, optional): Set to True to
enable date-casting when input dates are strings. Also adjust
date_format if date-strings are not in (yyyy-mm-dd) format.
Defaults to False.
date_format (str, optional): Format of input dates if date-strings
are given. Must be consistent across record pairs. If None
(the default), downstream functions for each backend assign
date_format to ISO 8601 format (yyyy-mm-dd).
Set to "yyyy-MM-dd" for Spark and "%Y-%m-%d" for DuckDB
when invalid_dates_as_null=True.
invalid_dates_as_null (bool, optional): Assign any dates that do not adhere
to date_format to the null level. Defaults to False.
include_exact_match_level (bool, optional): If True, include an exact match
level. Defaults to True.
term_frequency_adjustments (bool, optional): If True, apply term frequency
@@ -95,14 +108,6 @@ def __init__(
for the datediff thresholds specified. Defaults to None.
m_probability_else (_type_, optional): If provided, overrides the
default m probability for the 'anything else' level. Defaults to None.
cast_strings_to_date (bool, optional): Set to True to
enable date-casting when input dates are strings. Also adjust
date_format if date-strings are not in (yyyy-mm-dd) format.
Defaults to False.
date_format(str, optional): Format of input dates if date-strings
are given. Must be consistent across record pairs. If None
(the default), downstream functions for each backend assign
date_format to ISO 8601 format (yyyy-mm-dd).

Examples:
=== "DuckDB"
@@ -120,6 +125,15 @@ def __init__(
datediff_thresholds=[1, 1],
datediff_metrics=["month", "year"])
```
Date Comparison casting the column to a date and assigning values that do
not match the date_format to the null level
``` python
import splink.duckdb.duckdb_comparison_template_library as ctl
ctl.date_comparison("date_of_birth",
cast_strings_to_date=True,
date_format='%d/%m/%Y',
invalid_dates_as_null=True)
```
=== "Spark"
Basic Date Comparison
``` python
@@ -135,14 +149,25 @@ def __init__(
datediff_thresholds=[1, 1],
datediff_metrics=["month", "year"])
```

Date Comparison casting the column to a date and assigning values that do
not match the date_format to the null level
``` python
import splink.spark.spark_comparison_template_library as ctl
ctl.date_comparison("date_of_birth",
cast_strings_to_date=True,
date_format='dd/mm/yyyy',
invalid_dates_as_null=True)
```
Returns:
Comparison: A comparison that can be included in the Splink settings
dictionary.
"""
# Construct Comparison
comparison_levels = []
comparison_levels.append(self._null_level(col_name))
if invalid_dates_as_null:
comparison_levels.append(self._null_level(col_name, date_format))
else:
comparison_levels.append(self._null_level(col_name))

# Validate user inputs
datediff_error_logger(thresholds=datediff_thresholds, metrics=datediff_metrics)
@@ -172,9 +197,9 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"levenshtein",
levenshtein_thresholds,
m_probability_or_probabilities_lev,
distance_function_name="levenshtein",
distance_threshold_or_thresholds=levenshtein_thresholds,
m_probability_or_probabilities_thres=m_probability_or_probabilities_lev,
)
comparison_levels = comparison_levels + threshold_comparison_levels

@@ -183,9 +208,9 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"jaro",
jaro_thresholds,
m_probability_or_probabilities_jar,
distance_function_name="jaro",
distance_threshold_or_thresholds=jaro_thresholds,
m_probability_or_probabilities_thres=m_probability_or_probabilities_jar,
)
comparison_levels = comparison_levels + threshold_comparison_levels

@@ -194,9 +219,9 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"jaro-winkler",
jaro_winkler_thresholds,
m_probability_or_probabilities_jw,
distance_function_name="jaro-winkler",
distance_threshold_or_thresholds=jaro_winkler_thresholds,
m_probability_or_probabilities_thres=m_probability_or_probabilities_jw,
)
comparison_levels = comparison_levels + threshold_comparison_levels

@@ -292,6 +317,7 @@ class NameComparisonBase(Comparison):
def __init__(
self,
col_name: str,
regex_extract: str = None,
include_exact_match_level: bool = True,
phonetic_col_name: str = None,
term_frequency_adjustments_name: bool = False,
@@ -318,7 +344,8 @@ def __init__(
- Anything else

Args:
col_name (str): The name of the column to compare
col_name (str): The name of the column to compare.
regex_extract (str): Regular expression pattern to evaluate a match on.
include_exact_match_level (bool, optional): If True, include an exact match
level for col_name. Defaults to True.
phonetic_col_name (str): The name of the column with phonetic reduction
@@ -419,6 +446,7 @@ def __init__(
term_frequency_adjustments=term_frequency_adjustments_name,
m_probability=m_probability_exact_match_name,
include_colname_in_charts_label=True,
regex_extract=regex_extract,
)
comparison_levels.append(comparison_level)

@@ -428,6 +456,7 @@ def __init__(
term_frequency_adjustments=term_frequency_adjustments_phonetic_name,
m_probability=m_probability_exact_match_phonetic_name,
include_colname_in_charts_label=True,
regex_extract=regex_extract,
)
comparison_levels.append(comparison_level)

@@ -436,9 +465,10 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"levenshtein",
levenshtein_thresholds,
m_probability_or_probabilities_lev,
distance_function_name="levenshtein",
distance_threshold_or_thresholds=levenshtein_thresholds,
regex_extract=regex_extract,
m_probability_or_probabilities_thres=m_probability_or_probabilities_lev,
)
comparison_levels = comparison_levels + threshold_comparison_levels

@@ -447,9 +477,10 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"jaro",
jaro_thresholds,
m_probability_or_probabilities_jar,
distance_function_name="jaro",
distance_threshold_or_thresholds=jaro_thresholds,
regex_extract=regex_extract,
m_probability_or_probabilities_thres=m_probability_or_probabilities_jar,
)
comparison_levels = comparison_levels + threshold_comparison_levels

@@ -458,9 +489,10 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"jaro-winkler",
jaro_winkler_thresholds,
m_probability_or_probabilities_jw,
distance_function_name="jaro-winkler",
distance_threshold_or_thresholds=jaro_winkler_thresholds,
regex_extract=regex_extract,
m_probability_or_probabilities_thres=m_probability_or_probabilities_jw,
)
comparison_levels = comparison_levels + threshold_comparison_levels

@@ -469,9 +501,10 @@ def __init__(
threshold_comparison_levels = distance_threshold_comparison_levels(
self,
col_name,
"jaccard",
jaccard_thresholds,
m_probability_or_probabilities_jar,
distance_function_name="jaccard",
distance_threshold_or_thresholds=jaccard_thresholds,
regex_extract=regex_extract,
m_probability_or_probabilities_thres=m_probability_or_probabilities_jar,
)
comparison_levels = comparison_levels + threshold_comparison_levels

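
Taken together, a hedged end-to-end usage sketch of the template-library options added above (module path, formats, and patterns follow the docstring examples in this diff; the settings keys are the standard Splink ones):

```python
# Sketch based on the docstring examples above; patterns and formats are
# illustrative, not prescriptive.
import splink.duckdb.duckdb_comparison_template_library as ctl

# Cast date strings and send anything not matching the format to the null level
dob_comparison = ctl.date_comparison(
    "date_of_birth",
    cast_strings_to_date=True,
    date_format="%d/%m/%Y",
    invalid_dates_as_null=True,
)

# Name comparison evaluated on a regex extract of the column
name_comparison = ctl.name_comparison("first_name", regex_extract="^[A-Z]+")

settings = {
    "link_type": "dedupe_only",
    "comparisons": [dob_comparison, name_comparison],
}
```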
6 changes: 6 additions & 0 deletions splink/dialect_base.py
@@ -26,6 +26,12 @@ def _datediff_function(self):
f"comparisons/comparison levels?"
)

@property
def _regex_extract_function(self):
raise NotImplementedError(
"Regex extract option not defined for " "the SQL backend being used. "
)

@property
def _levenshtein_name(self):
return "levenshtein"
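
The base property above raising `NotImplementedError` is the extension point each backend overrides; a minimal sketch of how a hypothetical new dialect would opt in (the class, function, and SQL below are illustrative, not part of Splink):

```python
# Hypothetical backend sketch illustrating the dispatch pattern used by
# AthenaBase / DuckDBBase / SparkBase in this PR.
from splink.dialect_base import DialectBase


def postgres_regex_extract_sql(col_name, regex):
    # substring(... from ...) is Postgres's POSIX-regex extraction; chosen
    # here purely to illustrate a backend-specific fragment.
    return f"substring({col_name} from '{regex}')"


class HypotheticalPostgresBase(DialectBase):
    @property
    def _sql_dialect(self):
        return "postgres"

    @property
    def _regex_extract_function(self):
        return postgres_regex_extract_sql
```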
10 changes: 10 additions & 0 deletions splink/duckdb/duckdb_base.py
@@ -35,6 +35,12 @@ def datediff_sql(
"""


def regex_extract_sql(col_name, regex):
return f"""
regexp_extract({col_name}, '{regex}')
"""


class DuckDBBase(DialectBase):
@property
def _sql_dialect(self):
@@ -48,6 +54,10 @@ def _size_array_intersect_function(self):
def _datediff_function(self):
return datediff_sql

@property
def _regex_extract_function(self):
return regex_extract_sql

@property
def _jaro_name(self):
return "jaro_similarity"
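
As a quick check of the fragment DuckDB receives (assumes the `duckdb` Python package is installed; the postcode value and pattern are illustrative):

```python
import duckdb

# regexp_extract(string, pattern) returns the first substring matching the
# pattern, or the empty string if nothing matches.
con = duckdb.connect()
row = con.execute(
    "SELECT regexp_extract('SW1A 1AA', '^[A-Z]{1,2}[0-9][A-Z0-9]?')"
).fetchone()
print(row[0])  # 'SW1A'
```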
18 changes: 18 additions & 0 deletions splink/spark/spark_base.py
@@ -44,6 +44,20 @@ def datediff_sql(
"""


def regex_extract_sql(col_name, regex):
if "\\" in regex:
raise SyntaxError(
"Regular expressions containing “\\” (the python escape character) "
"are not compatible with Splink’s Spark linker. "
"Please consider using alternative syntax, "
"for example replacing “\\d” with “[0-9]”."
)
else:
return f"""
regexp_extract({col_name}, '{regex}', 0)
"""


class SparkBase(DialectBase):
@property
def _sql_dialect(self):
@@ -57,6 +71,10 @@ def _datediff_function(self):
def _size_array_intersect_function(self):
return size_array_intersect_sql

@property
def _regex_extract_function(self):
return regex_extract_sql

@property
def _jaro_name(self):
return "jaro_sim"
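
A standalone sketch of the escape-character guard above, reproduced here (rather than imported) so it runs without Spark installed; the error message is paraphrased:

```python
# Copy of the guard logic from spark_base.py above, for illustration only.
def spark_regex_extract_sql(col_name: str, regex: str) -> str:
    if "\\" in regex:
        raise SyntaxError(
            "Regular expressions containing '\\' are not compatible with "
            "Splink's Spark linker; replace e.g. '\\d' with '[0-9]'."
        )
    return f"regexp_extract({col_name}, '{regex}', 0)"


print(spark_regex_extract_sql("postcode_l", "^[A-Z]{1,2}[0-9]"))  # accepted
try:
    spark_regex_extract_sql("postcode_l", "^\\d{3}")  # backslash -> rejected
except SyntaxError as exc:
    print(exc)
```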